/*
 * IO submission data structure (Submission Queue Entry)
 */
struct io_uring_sqe {
        __u8    opcode;         /* type of operation for this sqe */
        __u8    flags;          /* IOSQE_ flags */
        __u16   ioprio;         /* ioprio for the request */
        __s32   fd;             /* file descriptor to do IO on */
        __u64   off;            /* offset into file */
        __u64   addr;           /* pointer to buffer or iovecs */
        __u32   len;            /* buffer size or number of iovecs */
        union {
                __kernel_rwf_t  rw_flags;
                __u32           fsync_flags;
                __u16           poll_events;
                __u32           sync_range_flags;
                __u32           msg_flags;
        };
        __u64   user_data;      /* data to be passed back at completion time */
        union {
                __u16   buf_index;      /* index into fixed buffers, if used */
                __u64   __pad2[3];
        };
};

/*
 * IO completion data structure (Completion Queue Entry)
 */
struct io_uring_cqe {
        __u64   user_data;      /* sqe->data submission passed back */
        __s32   res;            /* result code for this event */
        __u32   flags;
};
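To make the pairing concrete, here is a minimal userspace sketch (not kernel code; the helper names prep_readv and request_from_cqe are mine) of how an application might fill in an SQE for a readv and later identify the completion through user_data. It assumes the SQE and CQE pointers already point into the mmapped rings and that <linux/io_uring.h> from a 5.1+ kernel is available.

#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

/* Fill one SQE describing a readv of nr_vecs iovecs at the given offset. */
static void prep_readv(struct io_uring_sqe *sqe, int fd,
                       struct iovec *iov, unsigned nr_vecs, off_t offset)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_READV;          /* type of operation */
        sqe->fd = fd;                           /* file to read from */
        sqe->addr = (unsigned long) iov;        /* pointer to the iovec array */
        sqe->len = nr_vecs;                     /* number of iovecs */
        sqe->off = offset;                      /* file offset */
        sqe->user_data = (unsigned long) iov;   /* handed back in the CQE */
}

/*
 * The kernel passes user_data back untouched, so it can carry a pointer
 * (or an array index) identifying the original request; cqe->res then holds
 * the usual return value, i.e. bytes transferred or a negative errno.
 */
static struct iovec *request_from_cqe(const struct io_uring_cqe *cqe)
{
        return (struct iovec *)(unsigned long) cqe->user_data;
}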
struct io_sq_ring {
        /*
         * Head and tail offsets into the ring; the offsets need to be
         * masked to get valid indices.
         *
         * The kernel controls head and the application controls tail.
         */
        struct io_uring         r;
        /*
         * Bitmask to apply to head and tail offsets (constant, equals
         * ring_entries - 1)
         */
        u32                     ring_mask;
        /* Ring size (constant, power of 2) */
        u32                     ring_entries;
        /*
         * Number of invalid entries dropped by the kernel due to
         * invalid index stored in array
         *
         * Written by the kernel, shouldn't be modified by the
         * application (i.e. get number of "new events" by comparing to
         * cached value).
         *
         * After a new SQ head value was read by the application this
         * counter includes all submissions that were dropped reaching
         * the new SQ head (and possibly more).
         */
        u32                     dropped;
        /*
         * Runtime flags
         *
         * Written by the kernel, shouldn't be modified by the
         * application.
         *
         * The application needs a full memory barrier before checking
         * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
         */
        u32                     flags;
        /*
         * Ring buffer of indices into array of io_uring_sqe, which is
         * mmapped by the application using the IORING_OFF_SQES offset.
         *
         * This indirection could e.g. be used to assign fixed
         * io_uring_sqe entries to operations and only submit them to
         * the queue when needed.
         *
         * The kernel modifies neither the indices array nor the entries
         * array.
         */
        u32                     array[];
};
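The submission protocol this implies can be sketched from userspace as follows. The app_sq wrapper and sq_push helper are hypothetical: real code would resolve these pointers from struct io_sqring_offsets after mmapping the ring at IORING_OFF_SQ_RING. The barriers use the GCC/Clang __atomic builtins.

#include <errno.h>

/* Hypothetical application-side view of the mmapped SQ ring fields. */
struct app_sq {
        unsigned *head;         /* written by the kernel */
        unsigned *tail;         /* written by the application */
        unsigned *ring_mask;
        unsigned *ring_entries;
        unsigned *array;        /* indices into the mmapped SQE array */
};

/* Queue one SQE index; returns 0 on success, -EBUSY if the ring is full. */
static int sq_push(struct app_sq *sq, unsigned sqe_index)
{
        unsigned tail = *sq->tail;
        unsigned head = __atomic_load_n(sq->head, __ATOMIC_ACQUIRE);

        if (tail - head == *sq->ring_entries)
                return -EBUSY;  /* no room until the kernel advances head */

        sq->array[tail & *sq->ring_mask] = sqe_index;
        /* Publish the index before the kernel can observe the new tail. */
        __atomic_store_n(sq->tail, tail + 1, __ATOMIC_RELEASE);
        return 0;
}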
/*
 * This data is shared with the application through the mmap at offset
 * IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_cqring_offsets when calling io_uring_setup.
 */
struct io_cq_ring {
        /*
         * Head and tail offsets into the ring; the offsets need to be
         * masked to get valid indices.
         *
         * The application controls head and the kernel tail.
         */
        struct io_uring         r;
        /*
         * Bitmask to apply to head and tail offsets (constant, equals
         * ring_entries - 1)
         */
        u32                     ring_mask;
        /* Ring size (constant, power of 2) */
        u32                     ring_entries;
        /*
         * Number of completion events lost because the queue was full;
         * this should be avoided by the application by making sure
         * there are not more requests pending than there is space in
         * the completion queue.
         *
         * Written by the kernel, shouldn't be modified by the
         * application (i.e. get number of "new events" by comparing to
         * cached value).
         *
         * As completion events come in out of order this counter is not
         * ordered with any other data.
         */
        u32                     overflow;
        /*
         * Ring buffer of completion events.
         *
         * The kernel writes completion events fresh every time they are
         * produced, so the application is allowed to modify pending
         * entries.
         */
        struct io_uring_cqe     cqes[];
};
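The completion side mirrors this, with the roles of head and tail swapped: the kernel produces at the tail, the application consumes at the head. Continuing the sketch above, app_cq and cq_reap are again hypothetical application-side names.

/* Hypothetical application-side view of the mmapped CQ ring fields. */
struct app_cq {
        unsigned *head;                 /* written by the application */
        unsigned *tail;                 /* written by the kernel */
        unsigned *ring_mask;
        struct io_uring_cqe *cqes;
};

/* Drain all currently visible completions; returns how many were handled. */
static unsigned cq_reap(struct app_cq *cq,
                        void (*handle)(const struct io_uring_cqe *))
{
        unsigned head = *cq->head;
        unsigned seen = 0;

        /* Acquire pairs with the kernel's publication of new CQEs via tail. */
        while (head != __atomic_load_n(cq->tail, __ATOMIC_ACQUIRE)) {
                handle(&cq->cqes[head & *cq->ring_mask]);
                head++;
                seen++;
        }
        /* Let the kernel reuse the consumed slots. */
        __atomic_store_n(cq->head, head, __ATOMIC_RELEASE);
        return seen;
}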
        /*
         * If used, fixed file set. Writers must ensure that ->refs is dead,
         * readers must ensure that ->refs is alive as long as the file* is
         * used. Only updated through io_uring_register(2).
         */
        struct file             **user_files;
        unsigned                nr_user_files;
        /* if used, fixed mapped user buffers */
        unsigned                nr_user_bufs;
        struct io_mapped_ubuf   *user_bufs;
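Both sets are populated from userspace through io_uring_register(2). A hedged sketch of that call follows; register_resources is my name, and since older C libraries provide no wrapper, the raw syscall is used (recent kernel headers are needed for __NR_io_uring_register). Once registered, SQEs refer to the buffers by buf_index with IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED, and to the files by index when IOSQE_FIXED_FILE is set.

#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_resources(int ring_fd,
                              struct iovec *bufs, unsigned nr_bufs,
                              int *files, unsigned nr_files)
{
        int ret;

        /* Pins the user buffers up front; avoids per-IO get_user_pages(). */
        ret = syscall(__NR_io_uring_register, ring_fd,
                      IORING_REGISTER_BUFFERS, bufs, nr_bufs);
        if (ret < 0)
                return ret;

        /* Takes long-term references on the files; avoids per-IO fget/fput. */
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_FILES, files, nr_files);
}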
        struct {
                spinlock_t              completion_lock;
                bool                    poll_multi_file;
                /*
                 * ->poll_list is protected by the ctx->uring_lock for
                 * io_uring instances that don't use IORING_SETUP_SQPOLL.
                 * For SQPOLL, only the single threaded io_sq_thread() will
                 * manipulate the list, hence no extra locking is needed there.
                 */
                struct list_head        poll_list;
                struct list_head        cancel_list;
        } ____cacheline_aligned_in_smp;
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                u32, min_complete, u32, flags, const sigset_t __user *, sig,
                size_t, sigsz)
{
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
        int submitted = 0;
        struct fd f;

        //...

        /*
         * For SQ polling, the thread will do all submissions and completions.
         * Just return the requested submit count, and wake the thread if
         * we were asked to.
         */
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sqo_wait);
                submitted = to_submit;
                goto out_ctx;
        }
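From the application's point of view, the SQPOLL case looks roughly like the following sketch (sqpoll_submit is a hypothetical helper; sq_flags is assumed to point at the mmapped SQ ring flags field). Note the full barrier between updating the SQ tail and checking IORING_SQ_NEED_WAKEUP, as the io_sq_ring comment above requires: the syscall is only needed when the polling thread has gone idle.

#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Call after new SQEs have been queued and the SQ tail has been updated. */
static void sqpoll_submit(int ring_fd, unsigned *sq_flags, unsigned to_submit)
{
        /* Full barrier: order our tail store against the flags load below. */
        __atomic_thread_fence(__ATOMIC_SEQ_CST);

        if (__atomic_load_n(sq_flags, __ATOMIC_RELAXED) & IORING_SQ_NEED_WAKEUP)
                syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
                        IORING_ENTER_SQ_WAKEUP, NULL, 0);
        /* Otherwise the SQ polling thread is running and will pick them up. */
}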
        ret = 0;
        if (to_submit) {
                to_submit = min(to_submit, ctx->sq_entries);

                mutex_lock(&ctx->uring_lock);
                submitted = io_ring_submit(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);
        }
        if (flags & IORING_ENTER_GETEVENTS) {
                unsigned nr_events = 0;

                min_complete = min(min_complete, ctx->cq_entries);

                if (ctx->flags & IORING_SETUP_IOPOLL) {
                        ret = io_iopoll_check(ctx, &nr_events, min_complete);
                } else {
                        ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
                }
        }
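For the non-SQPOLL path, the matching userspace call is a single io_uring_enter(2) that both submits the pending SQEs and, if IORING_ENTER_GETEVENTS is set, waits until at least min_complete completions are available. A minimal sketch (submit_and_wait is my name, again using the raw syscall):

#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Submit to_submit SQEs and optionally wait for min_complete completions. */
static int submit_and_wait(int ring_fd, unsigned to_submit,
                           unsigned min_complete)
{
        unsigned flags = min_complete ? IORING_ENTER_GETEVENTS : 0;

        return syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
                       flags, NULL, 0);
}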
        if (refcount_dec_and_test(&req->refs)) {
                /* If we're not using fixed files, we have to pair the
                 * completion part with the file put. Use regular
                 * completions for those, only batch free for fixed
                 * file and non-linked commands.
                 */
                if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
                    REQ_F_FIXED_FILE) {
                        reqs[to_free++] = req;
                        if (to_free == ARRAY_SIZE(reqs))
                                io_free_req_many(ctx, reqs, &to_free);
                } else {
                        io_free_req(req);
                }
        }
}
        tail = ctx->cached_cq_tail;
        /*
         * writes to the cq entry need to come after reading head; the
         * control dependency is enough as we're using WRITE_ONCE to
         * fill the cq entry
         */
        if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
                return NULL;
... For peak performance, io_uring helps us get to 1.7M 4k IOPS with polling. aio reaches a performance cliff much lower than that, at 608K. The comparison here isn't quite fair, since aio doesn't support polled IO. If we disable polling, io_uring is able to drive about 1.2M IOPS for the (otherwise) same test case.