pipe: Use a linked list instead of a ring buffer
Use a linked list of pipe_buffers rather than a ring, allocating them as we
need them.  We cache one on the pipe_inode_info struct for fast reuse by
ordinary pipe writes, just as we can cache a spare folio.
Doing this will allow the pipe_buffer to have an integral variable-sized
bio_vec array pointing to the content of the buffer, allowing a buffer to
point to multiple folios.
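To make the shape of the change easier to follow, the buffer this moves to
looks roughly like the following (field names as used in the code below; the
exact layout lives in the full patch):

	struct pipe_buffer {
		struct list_head	queue_link;	/* Link in pipe->queue */
		const struct pipe_buf_operations *ops;
		unsigned int		flags;
		unsigned int		index;		/* First unconsumed bvec */
		unsigned int		nr_confirmed;	/* Bvecs confirmed so far */
		unsigned int		nr;		/* Bvecs in use */
		unsigned int		max;		/* Capacity of bvec[] */
		size_t			size;		/* Amount of data (bytes) */
		size_t			footprint;	/* Space pinned (pages) */
		unsigned long		private;
		struct bio_vec		bvec[];		/* The buffer content */
	};

Consumption advances ->index through ->bvec[]; once everything has been
consumed, the buffer is unlinked from pipe->queue and freed (or cached as
pipe->spare_buffer).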
Having a pipe_buffer that can point to multiple pages then allows splice to
append an entire splice segment consisting of multiple pages in a single
pipe buffer.
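For example, splicing a multipage segment of user pages now builds a single
buffer, along these lines (abridged from the iter_to_pipe() changes below):

	buf = pipe_alloc_buffer(pipe, &user_page_pipe_buf_ops,
				DIV_ROUND_UP(left + start, PAGE_SIZE),
				GFP_KERNEL, &ret);
	for (i = 0; i < buf->max && left > 0; i++) {
		size_t size = min_t(size_t, left, PAGE_SIZE - start);

		bvec_set_page(&buf->bvec[i], pages[i], size, start);
		buf->size += size;
		left -= size;
		start = 0;
	}
	buf->nr = i;
	pipe_add(pipe, buf, &full);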
The pipe buffer bvec can then be passed directly to, say, sendmsg() with
MSG_SPLICE_PAGES when splicing from the pipe, allowing the socket to be a
bit more efficient.
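As a sketch of the consumer side (along the lines of pipe_to_sendmsg() in the
splice changes below), the head buffer's bvec array can be handed to the
socket in one call:

	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | MSG_MORE };

	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, buf->bvec + buf->index,
		      buf->nr - buf->index, sd->len);
	ret = sock_sendmsg(sock, &msg);

MSG_SPLICE_PAGES asks the protocol to take references on the pages rather
than copying the data where it can.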
Signed-off-by: David Howells <dhowells@redhat.com>
diff --git a/fs/internal.h b/fs/internal.h
index f7a74cc..8508f85 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -178,6 +178,7 @@ extern void shrink_dentry_list(struct list_head *);
extern const struct file_operations pipefifo_fops;
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);
+void wakeup_pipe_readers(struct pipe_inode_info *pipe);
/*
* fs_pin.c
diff --git a/fs/pipe.c b/fs/pipe.c
index 4427903..5473528 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -129,31 +129,43 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
}
}
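+/*
+ * Wake anyone sleeping on the read side of the pipe and poke the fasync
+ * readers.  This was previously a static helper in fs/splice.c; it is now
+ * shared with the splice code via fs/internal.h.
+ */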
+void wakeup_pipe_readers(struct pipe_inode_info *pipe)
+{
+ smp_mb();
+ if (waitqueue_active(&pipe->rd_wait))
+ wake_up_interruptible(&pipe->rd_wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+}
+
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+	struct folio *folio = buf->bvec[buf->index].bv_folio;
- /*
- * If nobody else uses this page, and we don't already have a
- * temporary page, let's keep track of it as a one-deep
- * allocation cache. (Otherwise just release our reference to it)
- */
- if (page_count(page) == 1 && !pipe->tmp_page)
- pipe->tmp_page = page;
- else
- put_page(page);
+	/*
+	 * If nobody else uses this folio, and we don't already have a spare
+	 * folio, let's keep track of it as a one-deep allocation cache
+	 * (otherwise just release our reference to it).
+	 *
+	 * Note that ->release() puts a single bvec and advances buf->index;
+	 * pipe_buf_release() keeps calling it until the buffer is drained.
+	 */
+	if (folio_ref_count(folio) == 1 && !pipe->spare_folio)
+		pipe->spare_folio = folio;
+	else
+		folio_put(folio);
+	buf->index++;
}
static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+ struct folio *folio = buf->bvec[buf->index].bv_folio;
- if (page_count(page) != 1)
+ if (folio_ref_count(folio) != 1)
return false;
- memcg_kmem_uncharge_page(page, 0);
- __SetPageLocked(page);
+ memcg_kmem_uncharge_page(folio_page(folio, 0), 0);
+	__folio_set_locked(folio);
return true;
}
@@ -172,15 +184,15 @@ static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+ struct folio *folio = buf->bvec[buf->index].bv_folio;
/*
* A reference of one is golden, that means that the owner of this
* page is the only one holding a reference to it. lock the page
* and return OK.
*/
- if (page_count(page) == 1) {
- lock_page(page);
+ if (folio_ref_count(folio) == 1) {
+		folio_lock(folio);
return true;
}
return false;
@@ -199,7 +211,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
*/
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- return try_get_page(buf->page);
+ return folio_try_get(buf->bvec[buf->index].bv_folio);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
@@ -214,7 +226,10 @@ EXPORT_SYMBOL(generic_pipe_buf_get);
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- put_page(buf->page);
+	/* Put bvec[buf->index] and advance, per the ->release() convention */
+	folio_put(buf->bvec[buf->index++].bv_folio);
}
EXPORT_SYMBOL(generic_pipe_buf_release);
@@ -238,17 +253,19 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
*/
size_t pipe_query_space(struct pipe_inode_info *pipe, size_t *len, int *error)
{
- size_t used = pipe_occupancy(pipe->head, pipe->tail);
- size_t npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ size_t npages;
if (unlikely(!pipe->readers)) {
send_sig(SIGPIPE, current, 0);
*error = -EPIPE;
return 0;
}
-
- if (npages == 0)
+ if (pipe->footprint >= pipe->max_footprint) {
*error = -EAGAIN;
+ return 0;
+ }
+
+ npages = pipe->max_footprint - pipe->footprint;
*len = min_t(size_t, *len, npages * PAGE_SIZE);
return npages;
}
@@ -264,17 +281,8 @@ EXPORT_SYMBOL(pipe_query_space);
*/
size_t pipe_query_content(struct pipe_inode_info *pipe, size_t *len)
{
- unsigned int head = pipe->head;
- unsigned int tail = pipe->tail;
- size_t size = 0, used = pipe_occupancy(head, tail);
-
- while (!pipe_empty(head, tail)) {
- size += pipe_buf(pipe, tail)->len;
- tail++;
- }
-
- *len = size;
- return used;
+	struct pipe_buffer *buf;
+	size_t size = 0;
+
+	list_for_each_entry(buf, &pipe->queue, queue_link)
+		size += buf->size;
+	*len = size;
+	return pipe->footprint;
}
EXPORT_SYMBOL(pipe_query_content);
@@ -296,12 +304,41 @@ struct pipe_buffer *pipe_alloc_buffer(struct pipe_inode_info *pipe,
size_t bvcount, gfp_t gfp, int *error)
{
struct pipe_buffer *buf;
-	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+	if (pipe_full(pipe)) {
+		*error = -EAGAIN;
+		return NULL;
+	}
- buf = pipe_head_buf(pipe);
- memset(buf, 0, sizeof(*buf));
- buf->ops = ops;
+
+ if (bvcount < 1)
+ bvcount = 1;
+
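+	/* See if we can reuse the buffer cached by pipe_buf_release() */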
+ if (pipe->spare_buffer) {
+ spin_lock_irq(&pipe->rd_wait.lock);
+ buf = pipe->spare_buffer;
+ if (buf) {
+ if (buf->max >= bvcount)
+ pipe->spare_buffer = NULL;
+ else
+ buf = NULL;
+ }
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ if (buf) {
+ bvcount = buf->max;
+			memset(buf, 0, struct_size(buf, bvec, bvcount));
+			INIT_LIST_HEAD(&buf->queue_link);
+			buf->ops = ops;
+			buf->max = bvcount;
+ return buf;
+ }
+ }
+
+	buf = kzalloc(struct_size(buf, bvec, bvcount), gfp);
+ if (!buf) {
+ *error = -ENOMEM;
+ return NULL;
+ }
+
+	INIT_LIST_HEAD(&buf->queue_link);
+	buf->ops = ops;
+	buf->max = bvcount;
return buf;
}
EXPORT_SYMBOL(pipe_alloc_buffer);
@@ -323,23 +360,43 @@ EXPORT_SYMBOL(pipe_alloc_buffer);
ssize_t pipe_add(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
bool *full)
{
- unsigned int head = pipe->head;
- unsigned int tail = pipe->tail;
+ if (buf->size == 0 || WARN_ON(pipe_full(pipe)))
+ goto discard;
- if (WARN_ON(pipe_full(head, tail, pipe->max_usage)))
- goto error;
+ spin_lock_irq(&pipe->rd_wait.lock);
+ list_add_tail(&buf->queue_link, &pipe->queue);
+ pipe->footprint += buf->footprint;
+ *full = pipe_full(pipe);
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ return buf->size;
- pipe->head = head + 1;
- *full = pipe_full(head, tail, pipe->max_usage);
- return buf->len;
-
-error:
+discard:
pipe_buf_release(pipe, buf);
- *full = true;
- return -EAGAIN;
+ *full = pipe_full(pipe);
+ return 0;
}
EXPORT_SYMBOL(pipe_add);
+/**
+ * pipe_buf_release - put a reference to a pipe_buffer
+ * @pipe: the pipe that the buffer belongs to
+ * @buf: the buffer to put a reference to
+ */
+void pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+{
+	const struct pipe_buf_operations *ops = buf->ops;
+
+	/* Each ->release() call puts bvec[buf->index] and advances the index */
+	if (ops)
+		while (buf->index < buf->nr)
+			ops->release(pipe, buf);
+
+	spin_lock_irq(&pipe->rd_wait.lock);
+	if (!list_empty(&buf->queue_link)) {
+		pipe->footprint -= buf->footprint;
+		list_del_init(&buf->queue_link);
+	}
+	/* Cache one released buffer for reuse by ordinary pipe writes */
+	if (!pipe->spare_buffer) {
+		buf->ops = NULL;
+		pipe->spare_buffer = buf;
+		buf = NULL;
+	}
+	spin_unlock_irq(&pipe->rd_wait.lock);
+	kfree(buf);
+}
+
#ifdef CONFIG_WATCH_QUEUE
/**
* pipe_set_lost_mark - Mark the pipe as having lost some data
@@ -356,39 +413,96 @@ void pipe_set_lost_mark(struct pipe_inode_info *pipe)
{
struct pipe_buffer *buf;
- if (pipe_empty(pipe->head, pipe->tail)) {
+ spin_lock_irq(&pipe->rd_wait.lock);
+ if (pipe_empty(pipe)) {
pipe->note_loss = true;
} else {
- buf = pipe_buf(pipe, pipe->head - 1);
+ buf = list_last_entry(&pipe->queue, struct pipe_buffer, queue_link);
buf->flags |= PIPE_BUF_FLAG_LOSS;
}
+ spin_unlock_irq(&pipe->rd_wait.lock);
}
#endif
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
- unsigned int head = READ_ONCE(pipe->head);
- unsigned int tail = READ_ONCE(pipe->tail);
- unsigned int writers = READ_ONCE(pipe->writers);
-
- return !pipe_empty(head, tail) || !writers;
+ return !pipe_empty(pipe) || !READ_ONCE(pipe->writers);
}
-static ssize_t
-pipe_read(struct kiocb *iocb, struct iov_iter *to)
+/*
+ * Deal with the consumption of some data from a pipe buffer. Returns true if
+ * we've consumed all the data.
+ */
+bool pipe_consume(struct pipe_inode_info *pipe, struct pipe_buffer *buf, size_t consumed)
{
- size_t total_len = iov_iter_count(to);
+ if (WARN_ON_ONCE(consumed > buf->size))
+ consumed = buf->size;
+ buf->size -= consumed;
+
+ do {
+ struct bio_vec *bv = &buf->bvec[buf->index];
+ size_t part = min_t(size_t, consumed, bv->bv_len);
+
+ bv->bv_len -= part;
+ bv->bv_offset += part;
+ consumed -= part;
+
+ if (bv->bv_len > 0)
+ break;
+
+		/* ->release() puts bvec[buf->index] and advances the index */
+		buf->ops->release(pipe, buf);
+ } while (consumed > 0);
+
+ return buf->size == 0;
+}
+
+/*
+ * Copy data from a pipe buffer into an iterator, confirming the pages in the
+ * buffer as we use them and releasing them when we've used them.
+ */
+static ssize_t pipe_copy_buf_to_iter(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf,
+ struct iov_iter *iter)
+{
+ size_t part, n, copied = 0;
+ int ret = 0;
+
+ while (buf->size) {
+		struct bio_vec *bv = &buf->bvec[buf->index];
+
+ if (buf->nr_confirmed <= buf->index) {
+ ret = pipe_buf_confirm(pipe, buf);
+ if (ret < 0)
+ break;
+ }
+
+		part = min_t(size_t, bv->bv_len, iov_iter_count(iter));
+		if (!part)
+			break;
+ n = copy_folio_to_iter(bv->bv_folio, bv->bv_offset, part, iter);
+ if (unlikely(n < part)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ copied += n;
+ pipe_consume(pipe, buf, n);
+ }
+
+ return copied ?: ret;
+}
+
+static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *iter)
+{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
- bool was_full, wake_next_reader = false;
- ssize_t ret;
+ bool was_full, wake_next_reader = false, stop;
+ ssize_t copied = 0, ret = 0;
/* Null read succeeds. */
- if (unlikely(total_len == 0))
+ if (unlikely(!iov_iter_count(iter)))
return 0;
- ret = 0;
__pipe_lock(pipe);
/*
@@ -399,90 +513,57 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
* (WF_SYNC), because we want them to get going and generate more
* data for us.
*/
- was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
+ was_full = pipe_full(pipe);
for (;;) {
- /* Read ->head with a barrier vs post_one_notification() */
- unsigned int head = smp_load_acquire(&pipe->head);
- unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
+ struct pipe_buffer *buf;
#ifdef CONFIG_WATCH_QUEUE
if (pipe->note_loss) {
struct watch_notification n;
- if (total_len < 8) {
- if (ret == 0)
- ret = -ENOBUFS;
+ if (iov_iter_count(iter) < 8) {
+ ret = -ENOBUFS;
break;
}
n.type = WATCH_TYPE_META;
n.subtype = WATCH_META_LOSS_NOTIFICATION;
n.info = watch_sizeof(n);
- if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
+ if (copy_to_iter(&n, sizeof(n), iter) != sizeof(n)) {
if (ret == 0)
ret = -EFAULT;
break;
}
- ret += sizeof(n);
- total_len -= sizeof(n);
+ copied += sizeof(n);
pipe->note_loss = false;
}
#endif
- if (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
- size_t chars = buf->len;
- size_t written;
- int error;
-
- if (chars > total_len) {
- if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
- if (ret == 0)
- ret = -ENOBUFS;
- break;
- }
- chars = total_len;
- }
-
- error = pipe_buf_confirm(pipe, buf);
- if (error) {
- if (!ret)
- ret = error;
- break;
- }
-
- written = copy_page_to_iter(buf->page, buf->offset, chars, to);
- if (unlikely(written < chars)) {
- if (!ret)
- ret = -EFAULT;
- break;
- }
- ret += chars;
- buf->offset += chars;
- buf->len -= chars;
+ buf = pipe_head_buf(pipe);
+ if (buf) {
+ if (buf->ops->copy_to_iter)
+ ret = buf->ops->copy_to_iter(pipe, buf, iter);
+ else
+ ret = pipe_copy_buf_to_iter(pipe, buf, iter);
+			if (ret < 0)
+				break;
+			copied += ret;
/* Was it a packet buffer? Clean up and exit */
- if (buf->flags & PIPE_BUF_FLAG_PACKET) {
- total_len = chars;
- buf->len = 0;
- }
+ stop = buf->flags & PIPE_BUF_FLAG_PACKET;
+ if (stop)
+ buf->size = 0;
- if (!buf->len) {
- pipe_buf_release(pipe, buf);
- spin_lock_irq(&pipe->rd_wait.lock);
+ if (!buf->size) {
#ifdef CONFIG_WATCH_QUEUE
if (buf->flags & PIPE_BUF_FLAG_LOSS)
pipe->note_loss = true;
#endif
- tail++;
- pipe->tail = tail;
- spin_unlock_irq(&pipe->rd_wait.lock);
+ pipe_buf_release(pipe, buf);
}
- total_len -= chars;
- if (!total_len)
+
+			/* A packet read consumes the whole head buffer */
+			if (stop)
+				break;
+
+			if (!iov_iter_count(iter))
break; /* common path: read succeeded */
- if (!pipe_empty(head, tail)) /* More to do? */
+ if (!pipe_empty(pipe)) /* More to do? */
continue;
}
@@ -527,10 +608,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
return -ERESTARTSYS;
__pipe_lock(pipe);
- was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
+ was_full = pipe_full(pipe);
wake_next_reader = true;
}
- if (pipe_empty(pipe->head, pipe->tail))
+ if (pipe_empty(pipe))
wake_next_reader = false;
__pipe_unlock(pipe);
@@ -541,7 +622,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
-	if (ret > 0)
+	if (copied > 0)
file_accessed(filp);
- return ret;
+ return copied ?: ret;
}
static inline int is_packetized(struct file *file)
@@ -552,25 +633,47 @@ static inline int is_packetized(struct file *file)
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
- unsigned int head = READ_ONCE(pipe->head);
- unsigned int tail = READ_ONCE(pipe->tail);
- unsigned int max_usage = READ_ONCE(pipe->max_usage);
-
- return !pipe_full(head, tail, max_usage) ||
- !READ_ONCE(pipe->readers);
+ return !pipe_full(pipe) || !READ_ONCE(pipe->readers);
}
-static ssize_t
-pipe_write(struct kiocb *iocb, struct iov_iter *from)
+/*
+ * copy_iter_to_folio - Copy data from an iterator into a folio
+ * @iter: Source iterator
+ * @folio: Destination folio
+ * @offset: Offset within the folio to start writing
+ * @len: Amount to copy
+ */
+static ssize_t copy_iter_to_folio(struct iov_iter *iter, struct folio *folio,
+ size_t offset, size_t len)
+{
+ size_t copied = 0;
+
+ while (len > 0 && iov_iter_count(iter) > 0) {
+ size_t pnum = offset / PAGE_SIZE;
+ size_t poff = offset & ~PAGE_MASK;
+		size_t part = min3(len, PAGE_SIZE - poff, iov_iter_count(iter));
+ size_t n;
+
+ n = copy_page_from_iter(folio_page(folio, pnum), poff, part, iter);
+ offset += n;
+		copied += n;
+		len -= n;
+ if (n < part)
+ return copied ?: -EFAULT;
+ }
+
+ return copied;
+}
+
+static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
- unsigned int head;
- ssize_t ret = 0;
size_t total_len = iov_iter_count(from);
- ssize_t chars;
+ ssize_t written = 0, chars;
bool was_empty = false;
bool wake_next_writer = false;
+	bool full;
+ int ret = 0;
/* Null write succeeds. */
if (unlikely(total_len == 0))
@@ -599,27 +702,28 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* page-aligns the rest of the writes for large writes
* spanning multiple pages.
*/
- head = pipe->head;
- was_empty = pipe_empty(head, pipe->tail);
+	was_empty = pipe_empty(pipe);
+	full = pipe_full(pipe);
chars = total_len & (PAGE_SIZE-1);
if (chars && !was_empty) {
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
- int offset = buf->offset + buf->len;
+ struct pipe_buffer *buf =
+ list_last_entry(&pipe->queue,
+ struct pipe_buffer, queue_link);
+ struct bio_vec *bv = &buf->bvec[0];
+ size_t offset = bv->bv_offset + bv->bv_len;
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
- offset + chars <= PAGE_SIZE) {
+ offset + chars <= folio_size(bv->bv_folio)) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;
- ret = copy_page_from_iter(buf->page, offset, chars, from);
+ ret = copy_iter_to_folio(from, bv->bv_folio, offset, chars);
if (unlikely(ret < chars)) {
ret = -EFAULT;
goto out;
}
- buf->len += ret;
+			bv->bv_len += ret;
+			buf->size += ret;
+			written += ret;
if (!iov_iter_count(from))
goto out;
}
@@ -628,81 +732,69 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
for (;;) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
- if (!ret)
- ret = -EPIPE;
+ ret = -EPIPE;
break;
}
- head = pipe->head;
- if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[head & mask];
- struct page *page = pipe->tmp_page;
- int copied;
+ if (!full) {
+ struct pipe_buffer *buf;
+			struct folio *folio;
+ ssize_t copied;
+ size_t part;
- if (!page) {
- page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
- if (unlikely(!page)) {
- ret = ret ? : -ENOMEM;
+ buf = pipe_alloc_buffer(pipe, &anon_pipe_buf_ops,
+ 1, GFP_KERNEL, &ret);
+ if (!buf)
+ break;
+
+ folio = pipe->spare_folio;
+ if (!folio) {
+ folio = folio_alloc(GFP_HIGHUSER | __GFP_ACCOUNT, 0);
+				if (unlikely(!folio)) {
+					kfree(buf);
+					ret = -ENOMEM;
break;
}
- pipe->tmp_page = page;
+ } else {
+ pipe->spare_folio = NULL;
}
- /* Allocate a slot in the ring in advance and attach an
- * empty buffer. If we fault or otherwise fail to use
- * it, either the reader will consume it or it'll still
- * be there for the next write.
- */
- spin_lock_irq(&pipe->rd_wait.lock);
+ buf->bvec[0].bv_folio = folio;
+ buf->bvec[0].bv_offset = 0;
+ buf->bvec[0].bv_len = 0;
+ buf->nr = 1;
+ buf->footprint += folio_nr_pages(folio);
- head = pipe->head;
- if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- spin_unlock_irq(&pipe->rd_wait.lock);
- continue;
- }
-
- pipe->head = head + 1;
- spin_unlock_irq(&pipe->rd_wait.lock);
-
- /* Insert it into the buffer array */
- buf = &pipe->bufs[head & mask];
- buf->page = page;
- buf->ops = &anon_pipe_buf_ops;
- buf->offset = 0;
- buf->len = 0;
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
- pipe->tmp_page = NULL;
- copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
- if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
+ part = min(iov_iter_count(from), folio_size(folio));
+ copied = copy_iter_to_folio(from, folio, 0, folio_size(folio));
-				if (!ret)
-					ret = -EFAULT;
-				break;
-			}
-			ret += copied;
-			buf->offset = 0;
-			buf->len = copied;
+			if (copied > 0) {
+				written += copied;
+				buf->bvec[0].bv_len += copied;
+				buf->size += copied;
+			}
+			/* Queue whatever we copied (pipe_add() discards an
+			 * empty buffer). */
+			pipe_add(pipe, buf, &full);
+			if (unlikely(copied < part)) {
+				ret = -EFAULT;
+				break;
+			}
if (!iov_iter_count(from))
break;
}
- if (!pipe_full(head, pipe->tail, pipe->max_usage))
+ if (!full)
continue;
/* Wait for buffer space to become available. */
if (filp->f_flags & O_NONBLOCK) {
- if (!ret)
- ret = -EAGAIN;
+ ret = -EAGAIN;
break;
}
if (signal_pending(current)) {
- if (!ret)
- ret = -ERESTARTSYS;
+ ret = -ERESTARTSYS;
break;
}
@@ -718,11 +810,12 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
- was_empty = pipe_empty(pipe->head, pipe->tail);
+ was_empty = pipe_empty(pipe);
wake_next_writer = true;
+ full = pipe_full(pipe);
}
out:
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_full(pipe))
wake_next_writer = false;
__pipe_unlock(pipe);
@@ -743,44 +836,35 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
- int err = file_update_time(filp);
- if (err)
- ret = err;
+ if (written && sb_start_write_trylock(file_inode(filp)->i_sb)) {
+ ret = file_update_time(filp);
+ if (ret)
+ written = ret;
sb_end_write(file_inode(filp)->i_sb);
}
- return ret;
+ return written ?: ret;
}
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- unsigned int count, head, tail, mask;
+ struct pipe_buffer *buf;
+ unsigned int count;
switch (cmd) {
case FIONREAD:
__pipe_lock(pipe);
count = 0;
- head = pipe->head;
- tail = pipe->tail;
- mask = pipe->ring_size - 1;
-
- while (tail != head) {
- count += pipe->bufs[tail & mask].len;
- tail++;
+ list_for_each_entry(buf, &pipe->queue, queue_link) {
+ count += buf->size;
}
__pipe_unlock(pipe);
return put_user(count, (int __user *)arg);
#ifdef CONFIG_WATCH_QUEUE
- case IOC_WATCH_QUEUE_SET_SIZE: {
- int ret;
- __pipe_lock(pipe);
- ret = watch_queue_set_size(pipe, arg);
- __pipe_unlock(pipe);
- return ret;
- }
+ case IOC_WATCH_QUEUE_SET_SIZE:
+ return 0; /* Does nothing for the moment. */
case IOC_WATCH_QUEUE_SET_FILTER:
return watch_queue_set_filter(
@@ -798,7 +882,6 @@ pipe_poll(struct file *filp, poll_table *wait)
{
__poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
- unsigned int head, tail;
/* Epoll has some historical nasty semantics, this enables them */
WRITE_ONCE(pipe->poll_usage, true);
@@ -819,19 +902,16 @@ pipe_poll(struct file *filp, poll_table *wait)
* if something changes and you got it wrong, the poll
* table entry will wake you up and fix it.
*/
- head = READ_ONCE(pipe->head);
- tail = READ_ONCE(pipe->tail);
-
mask = 0;
if (filp->f_mode & FMODE_READ) {
- if (!pipe_empty(head, tail))
+ if (!pipe_empty(pipe))
mask |= EPOLLIN | EPOLLRDNORM;
if (!pipe->writers && filp->f_version != pipe->w_counter)
mask |= EPOLLHUP;
}
if (filp->f_mode & FMODE_WRITE) {
- if (!pipe_full(head, tail, pipe->max_usage))
+ if (!pipe_full(pipe))
mask |= EPOLLOUT | EPOLLWRNORM;
/*
* Most Unices do not set EPOLLERR for FIFOs but on Linux they
@@ -902,27 +982,27 @@ pipe_fasync(int fd, struct file *filp, int on)
return retval;
}
-unsigned long account_pipe_buffers(struct user_struct *user,
- unsigned long old, unsigned long new)
+static unsigned long account_pipe_buffers(struct user_struct *user,
+ unsigned long old, unsigned long new)
{
return atomic_long_add_return(new - old, &user->pipe_bufs);
}
-bool too_many_pipe_buffers_soft(unsigned long user_bufs)
+static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
return soft_limit && user_bufs > soft_limit;
}
-bool too_many_pipe_buffers_hard(unsigned long user_bufs)
+static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
return hard_limit && user_bufs > hard_limit;
}
-bool pipe_is_unprivileged_user(void)
+static bool pipe_is_unprivileged_user(void)
{
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}
@@ -930,45 +1010,38 @@ bool pipe_is_unprivileged_user(void)
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
- unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
struct user_struct *user = get_current_user();
- unsigned long user_bufs;
- unsigned int max_size = READ_ONCE(pipe_max_size);
+ size_t limit = PIPE_DEF_BUFFERS, user_bufs;
+	size_t sys = max_t(size_t, DIV_ROUND_UP(READ_ONCE(pipe_max_size), PAGE_SIZE), 1);
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
if (pipe == NULL)
goto out_free_uid;
- if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
- pipe_bufs = max_size >> PAGE_SHIFT;
+ if (limit > sys && !capable(CAP_SYS_RESOURCE))
+ limit = sys;
- user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
+ user_bufs = account_pipe_buffers(user, 0, limit);
if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
- user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
- pipe_bufs = PIPE_MIN_DEF_BUFFERS;
+ user_bufs = account_pipe_buffers(user, limit, PIPE_MIN_DEF_BUFFERS);
+ limit = PIPE_MIN_DEF_BUFFERS;
}
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
- pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
- GFP_KERNEL_ACCOUNT);
-
- if (pipe->bufs) {
- init_waitqueue_head(&pipe->rd_wait);
- init_waitqueue_head(&pipe->wr_wait);
- pipe->r_counter = pipe->w_counter = 1;
- pipe->max_usage = pipe_bufs;
- pipe->ring_size = pipe_bufs;
- pipe->nr_accounted = pipe_bufs;
- pipe->user = user;
- mutex_init(&pipe->mutex);
- return pipe;
- }
+ INIT_LIST_HEAD(&pipe->queue);
+ init_waitqueue_head(&pipe->rd_wait);
+ init_waitqueue_head(&pipe->wr_wait);
+ pipe->r_counter = pipe->w_counter = 1;
+ pipe->max_footprint = limit;
+ pipe->user = user;
+ mutex_init(&pipe->mutex);
+ return pipe;
out_revert_acct:
- (void) account_pipe_buffers(user, pipe_bufs, 0);
+ (void) account_pipe_buffers(user, limit, 0);
kfree(pipe);
out_free_uid:
free_uid(user);
@@ -977,27 +1050,26 @@ struct pipe_inode_info *alloc_pipe_info(void)
void free_pipe_info(struct pipe_inode_info *pipe)
{
- unsigned int i;
+ struct pipe_buffer *buf;
#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue)
watch_queue_clear(pipe->watch_queue);
#endif
- (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
+	(void) account_pipe_buffers(pipe->user, pipe->max_footprint, 0);
free_uid(pipe->user);
- for (i = 0; i < pipe->ring_size; i++) {
- struct pipe_buffer *buf = pipe->bufs + i;
- if (buf->ops)
- pipe_buf_release(pipe, buf);
+ while ((buf = list_first_entry_or_null(
+ &pipe->queue, struct pipe_buffer, queue_link))) {
+ pipe_buf_release(pipe, buf);
}
#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue)
put_watch_queue(pipe->watch_queue);
#endif
- if (pipe->tmp_page)
- __free_page(pipe->tmp_page);
- kfree(pipe->bufs);
+ if (pipe->spare_folio)
+ folio_put(pipe->spare_folio);
+ kfree(pipe->spare_buffer);
kfree(pipe);
}
@@ -1376,96 +1448,14 @@ const struct file_operations pipefifo_fops = {
};
/*
- * Currently we rely on the pipe array holding a power-of-2 number
- * of pages. Returns 0 on error.
- */
-static unsigned int round_pipe_size(unsigned long size)
-{
- if (size > (1U << 31))
- return 0;
-
- /* Minimum pipe size, as required by POSIX */
- if (size < PAGE_SIZE)
- return PAGE_SIZE;
-
- return roundup_pow_of_two(size);
-}
-
-/*
- * Resize the pipe ring to a number of slots.
- *
- * Note the pipe can be reduced in capacity, but only if the current
- * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
- * returned instead.
- */
-int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
-{
- struct pipe_buffer *bufs;
- unsigned int head, tail, mask, n;
-
- bufs = kcalloc(nr_slots, sizeof(*bufs),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (unlikely(!bufs))
- return -ENOMEM;
-
- spin_lock_irq(&pipe->rd_wait.lock);
- mask = pipe->ring_size - 1;
- head = pipe->head;
- tail = pipe->tail;
-
- n = pipe_occupancy(head, tail);
- if (nr_slots < n) {
- spin_unlock_irq(&pipe->rd_wait.lock);
- kfree(bufs);
- return -EBUSY;
- }
-
- /*
- * The pipe array wraps around, so just start the new one at zero
- * and adjust the indices.
- */
- if (n > 0) {
- unsigned int h = head & mask;
- unsigned int t = tail & mask;
- if (h > t) {
- memcpy(bufs, pipe->bufs + t,
- n * sizeof(struct pipe_buffer));
- } else {
- unsigned int tsize = pipe->ring_size - t;
- if (h > 0)
- memcpy(bufs + tsize, pipe->bufs,
- h * sizeof(struct pipe_buffer));
- memcpy(bufs, pipe->bufs + t,
- tsize * sizeof(struct pipe_buffer));
- }
- }
-
- head = n;
- tail = 0;
-
- kfree(pipe->bufs);
- pipe->bufs = bufs;
- pipe->ring_size = nr_slots;
- if (pipe->max_usage > nr_slots)
- pipe->max_usage = nr_slots;
- pipe->tail = tail;
- pipe->head = head;
-
- spin_unlock_irq(&pipe->rd_wait.lock);
-
- /* This might have made more room for writers */
- wake_up_interruptible(&pipe->wr_wait);
- return 0;
-}
-
-/*
- * Allocate a new array of pipe buffers and copy the info over. Returns the
- * pipe size if successful, or return -ERROR on error.
+ * Change the limit on the amount of data allowed into a pipe. Returns the pipe
+ * size if successful, or return -ERROR on error.
*/
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
unsigned long user_bufs;
- unsigned int nr_slots, size;
+ size_t limit;
+	size_t sys = max_t(size_t, DIV_ROUND_UP(pipe_max_size, PAGE_SIZE), 1);
long ret = 0;
#ifdef CONFIG_WATCH_QUEUE
@@ -1473,43 +1463,34 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
return -EBUSY;
#endif
- size = round_pipe_size(arg);
- nr_slots = size >> PAGE_SHIFT;
-
- if (!nr_slots)
- return -EINVAL;
+ limit = DIV_ROUND_UP(arg, PAGE_SIZE);
+	limit = max_t(size_t, limit, 1);
/*
- * If trying to increase the pipe capacity, check that an
- * unprivileged user is not trying to exceed various limits
- * (soft limit check here, hard limit check just below).
- * Decreasing the pipe capacity is always permitted, even
- * if the user is currently over a limit.
+ * If trying to increase the pipe capacity, check that an unprivileged
+ * user is not trying to exceed various limits (soft limit check here,
+ * hard limit check just below). Decreasing the pipe capacity is
+ * always permitted, even if the user is currently over a limit.
*/
- if (nr_slots > pipe->max_usage &&
- size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+ if (limit > pipe->max_footprint &&
+ limit > sys && !capable(CAP_SYS_RESOURCE))
return -EPERM;
- user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
+ user_bufs = account_pipe_buffers(pipe->user, pipe->max_footprint, limit);
- if (nr_slots > pipe->max_usage &&
- (too_many_pipe_buffers_hard(user_bufs) ||
- too_many_pipe_buffers_soft(user_bufs)) &&
- pipe_is_unprivileged_user()) {
+ if (limit > pipe->max_footprint &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ pipe_is_unprivileged_user()) {
ret = -EPERM;
goto out_revert_acct;
}
- ret = pipe_resize_ring(pipe, nr_slots);
- if (ret < 0)
- goto out_revert_acct;
-
- pipe->max_usage = nr_slots;
- pipe->nr_accounted = nr_slots;
- return pipe->max_usage * PAGE_SIZE;
+ pipe->max_footprint = limit;
+ return pipe->max_footprint * PAGE_SIZE;
out_revert_acct:
- (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
+ (void) account_pipe_buffers(pipe->user, limit, pipe->max_footprint);
return ret;
}
@@ -1546,7 +1527,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
ret = pipe_set_size(pipe, arg);
break;
case F_GETPIPE_SZ:
- ret = pipe->max_usage * PAGE_SIZE;
+		ret = pipe->max_footprint * PAGE_SIZE;
break;
default:
ret = -EINVAL;
@@ -1593,7 +1574,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
if (write) {
unsigned int val;
- val = round_pipe_size(*lvalp);
+ val = round_up(*lvalp, PAGE_SIZE);
if (val == 0)
return -EINVAL;
diff --git a/fs/pipe.h b/fs/pipe.h
index 0d749bf..ef11dff 100644
--- a/fs/pipe.h
+++ b/fs/pipe.h
@@ -3,15 +3,15 @@
/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
+ * @queue: The queue of pipe buffers.
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
- * @head: The point of buffer production
- * @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
- * @max_usage: The maximum number of slots that may be used in the ring
- * @ring_size: total number of buffers (should be a power of 2)
- * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
- * @tmp_page: cached released page
+ * @footprint: The amount of space pinned by the pipe (in pages).
+ * @max_footprint: The maximum amount of space that can be pinned (in pages).
+ * @content: The amount of content (in bytes).
+ * @spare_folio: Cached released folio
+ * @spare_buffer: Cached released buffer
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
@@ -20,87 +20,70 @@
* @poll_usage: is this pipe used for epoll, which has crazy wakeups?
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
- * @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
- struct mutex mutex;
- wait_queue_head_t rd_wait, wr_wait;
- unsigned int head;
- unsigned int tail;
- unsigned int max_usage;
- unsigned int ring_size;
+ struct mutex mutex;
+ struct list_head queue;
+ wait_queue_head_t rd_wait, wr_wait;
#ifdef CONFIG_WATCH_QUEUE
- bool note_loss;
+ bool note_loss;
#endif
- unsigned int nr_accounted;
- unsigned int readers;
- unsigned int writers;
- unsigned int files;
- unsigned int r_counter;
- unsigned int w_counter;
- bool poll_usage;
- struct page *tmp_page;
- struct fasync_struct *fasync_readers;
- struct fasync_struct *fasync_writers;
- struct pipe_buffer *bufs;
- struct user_struct *user;
+ size_t footprint;
+ size_t max_footprint;
+ size_t content;
+ unsigned int readers;
+ unsigned int writers;
+ unsigned int files;
+ unsigned int r_counter;
+ unsigned int w_counter;
+ bool poll_usage;
+ struct folio *spare_folio;
+ struct pipe_buffer *spare_buffer;
+ struct fasync_struct *fasync_readers;
+ struct fasync_struct *fasync_writers;
+ struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
- struct watch_queue *watch_queue;
+ struct watch_queue *watch_queue;
#endif
};
/**
* pipe_empty - Return true if the pipe is empty
- * @head: The pipe ring head pointer
- * @tail: The pipe ring tail pointer
+ * @pipe: The pipe to query
*/
-static inline bool pipe_empty(unsigned int head, unsigned int tail)
+static inline bool pipe_empty(const struct pipe_inode_info *pipe)
{
- return head == tail;
-}
-
-/**
- * pipe_occupancy - Return number of slots used in the pipe
- * @head: The pipe ring head pointer
- * @tail: The pipe ring tail pointer
- */
-static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
-{
- return head - tail;
+ return list_empty(&pipe->queue);
}
/**
* pipe_full - Return true if the pipe is full
- * @head: The pipe ring head pointer
- * @tail: The pipe ring tail pointer
- * @limit: The maximum amount of slots available.
+ * @pipe: The pipe to query
*/
-static inline bool pipe_full(unsigned int head, unsigned int tail,
- unsigned int limit)
+static inline bool pipe_full(const struct pipe_inode_info *pipe)
{
- return pipe_occupancy(head, tail) >= limit;
+ return pipe->footprint >= pipe->max_footprint;
}
/**
- * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring
- * @pipe: The pipe to access
- * @slot: The slot of interest
+ * pipe_occupancy - Return the number of pages currently pinned by a pipe
+ * @pipe: The pipe to query
*/
-static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
- unsigned int slot)
+static inline size_t pipe_occupancy(const struct pipe_inode_info *pipe)
{
- return &pipe->bufs[slot & (pipe->ring_size - 1)];
+	return pipe->footprint;
}
/**
- * pipe_head_buf - Return the pipe buffer at the head of the pipe ring
+ * pipe_head_buf - Return the head pipe buffer or NULL
* @pipe: The pipe to access
*/
-static inline struct pipe_buffer *pipe_head_buf(const struct pipe_inode_info *pipe)
+static inline struct pipe_buffer *pipe_head_buf(struct pipe_inode_info *pipe)
{
- return pipe_buf(pipe, pipe->head);
+ return list_first_entry_or_null(&pipe->queue,
+ struct pipe_buffer, queue_link);
}
/* Wait for a pipe to be readable/writable while dropping the pipe lock */
@@ -108,16 +91,4 @@ void pipe_wait_readable(struct pipe_inode_info *);
void pipe_wait_writable(struct pipe_inode_info *);
struct pipe_inode_info *alloc_pipe_info(void);
-
-#ifdef CONFIG_WATCH_QUEUE
-unsigned long account_pipe_buffers(struct user_struct *user,
- unsigned long old, unsigned long new);
-bool too_many_pipe_buffers_soft(unsigned long user_bufs);
-bool too_many_pipe_buffers_hard(unsigned long user_bufs);
-bool pipe_is_unprivileged_user(void);
-#endif
-
-/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
-#ifdef CONFIG_WATCH_QUEUE
-int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
-#endif
+bool pipe_consume(struct pipe_inode_info *pipe, struct pipe_buffer *buf, size_t consumed);
diff --git a/fs/splice.c b/fs/splice.c
index 8bbbb19..1889b86 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -48,7 +48,7 @@
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct folio *folio = page_folio(buf->page);
+ struct folio *folio = page_folio(buf->bvec[buf->index].bv_page);
struct address_space *mapping;
folio_lock(folio);
@@ -93,8 +93,9 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- put_page(buf->page);
- buf->flags &= ~PIPE_BUF_FLAG_LRU;
+ put_page(buf->bvec[buf->index++].bv_page);
+ if (buf->index == buf->nr)
+ buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
/*
@@ -104,38 +105,34 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+ struct folio *folio = page_folio(buf->bvec[buf->index].bv_page);
int err;
- if (!PageUptodate(page)) {
- lock_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_lock(folio);
/*
- * Page got truncated/unhashed. This will cause a 0-byte
+ * Folio got truncated/unhashed. This will cause a 0-byte
* splice, if this is the first page.
*/
- if (!page->mapping) {
+ if (!folio->mapping) {
err = -ENODATA;
goto error;
}
- /*
- * Uh oh, read-error from disk.
- */
- if (!PageUptodate(page)) {
+ /* Uh oh, read-error from disk. */
+ if (!folio_test_uptodate(folio)) {
err = -EIO;
goto error;
}
- /*
- * Page is ok afterall, we are done.
- */
- unlock_page(page);
+ /* Folio is ok afterall, we are done. */
+ folio_unlock(folio);
}
return 0;
error:
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
@@ -162,116 +159,25 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
.get = generic_pipe_buf_get,
};
-static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
-{
- smp_mb();
- if (waitqueue_active(&pipe->rd_wait))
- wake_up_interruptible(&pipe->rd_wait);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
-}
-
-/**
- * splice_to_pipe - fill passed data into a pipe
- * @pipe: pipe to fill
- * @spd: data to fill
- *
- * Description:
- * @spd contains a map of pages and len/offset tuples, along with
- * the struct pipe_buf_operations associated with these pages. This
- * function will link that data to the pipe.
- *
- */
-ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
- struct splice_pipe_desc *spd)
-{
- struct pipe_buffer *buf;
- unsigned int spd_pages = spd->nr_pages;
- size_t len = INT_MAX, spliced = 0;
- bool full = false;
- int ret = -EAGAIN, page_nr = 0;
-
- if (!spd_pages)
- return 0;
-
- if (!pipe_query_space(pipe, &len, &ret))
- goto out;
-
- do {
- buf = pipe_alloc_buffer(pipe, spd->ops, 1, GFP_KERNEL, &ret);
- if (!buf)
- goto out;
-
- buf->page = spd->pages[page_nr];
- buf->offset = spd->partial[page_nr].offset;
- buf->len = spd->partial[page_nr].len;
- buf->private = spd->partial[page_nr].private;
- page_nr++;
- spd->nr_pages--;
-
- spliced += pipe_add(pipe, buf, &full);
- } while (!full && spd->nr_pages);
-
-out:
- while (page_nr < spd_pages)
- spd->spd_release(spd, page_nr++);
-
- return spliced ?: ret;
-}
-EXPORT_SYMBOL_GPL(splice_to_pipe);
-
-ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
-{
- unsigned int head = pipe->head;
- unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
- int ret;
-
- if (unlikely(!pipe->readers)) {
- send_sig(SIGPIPE, current, 0);
- ret = -EPIPE;
- } else if (pipe_full(head, tail, pipe->max_usage)) {
- ret = -EAGAIN;
- } else {
- pipe->bufs[head & mask] = *buf;
- pipe->head = head + 1;
- return buf->len;
- }
- pipe_buf_release(pipe, buf);
- return ret;
-}
-EXPORT_SYMBOL(add_to_pipe);
-
/*
* Check if we need to grow the arrays holding pages and partial page
* descriptions.
*/
-int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+int splice_grow_buf(const struct pipe_inode_info *pipe, struct pipe_buffer **_buf)
{
- unsigned int max_usage = READ_ONCE(pipe->max_usage);
+	struct pipe_buffer *buf = *_buf;
+	size_t was = struct_size(buf, bvec, buf->nr);
+	size_t to = struct_size(buf, bvec, buf->nr + 1);
- spd->nr_pages_max = max_usage;
- if (max_usage <= PIPE_DEF_BUFFERS)
- return 0;
+	/* Note: the buffer must not be on a pipe's queue when it is resized */
+	buf = krealloc(buf, to, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
- spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
- spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
- GFP_KERNEL);
-
- if (spd->pages && spd->partial)
- return 0;
-
- kfree(spd->pages);
- kfree(spd->partial);
- return -ENOMEM;
+	memset((void *)buf + was, 0, to - was);
+	buf->max = buf->nr + 1;
+	*_buf = buf;
+	return 0;
}
-void splice_shrink_spd(struct splice_pipe_desc *spd)
+void splice_shrink_buf(struct pipe_buffer *buf)
{
- if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
- return;
-
- kfree(spd->pages);
- kfree(spd->partial);
}
/*
@@ -282,29 +188,35 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
{
+ struct pipe_buffer *buf;
struct iov_iter to;
- struct bio_vec *bv;
struct kiocb kiocb;
struct page **pages;
ssize_t ret;
- size_t used, npages, chunk, remain, keep = 0;
- int i;
+ size_t npages, chunk, remain, keep;
+ bool full = false;
+ int i, error = -EAGAIN;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
- npages = max_t(ssize_t, pipe->max_usage - used, 0);
- len = min_t(size_t, len, npages * PAGE_SIZE);
- npages = DIV_ROUND_UP(len, PAGE_SIZE);
+ npages = pipe_query_space(pipe, &len, &error);
+ if (!npages)
+ return error;
- bv = kzalloc(array_size(npages, sizeof(bv[0])) +
- array_size(npages, sizeof(struct page *)), GFP_KERNEL);
- if (!bv)
+ buf = pipe_alloc_buffer(pipe, &page_cache_pipe_buf_ops, npages,
+ GFP_KERNEL, &error);
+ if (!buf)
+ return error;
+
+ pages = kzalloc(array_size(npages, sizeof(struct page *)), GFP_KERNEL);
+ if (!pages) {
+ kfree(buf);
return -ENOMEM;
+ }
- pages = (struct page **)(bv + npages);
npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
if (!npages) {
- kfree(bv);
+ kfree(buf);
+ kfree(pages);
return -ENOMEM;
}
@@ -312,14 +224,14 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
for (i = 0; i < npages; i++) {
chunk = min_t(size_t, PAGE_SIZE, remain);
- bv[i].bv_page = pages[i];
- bv[i].bv_offset = 0;
- bv[i].bv_len = chunk;
+ buf->bvec[i].bv_page = pages[i];
+ buf->bvec[i].bv_offset = 0;
+ buf->bvec[i].bv_len = chunk;
remain -= chunk;
}
/* Do the I/O */
- iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
+ iov_iter_bvec(&to, ITER_DEST, buf->bvec, npages, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -340,24 +252,12 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos,
/* Free any pages that didn't get touched at all. */
if (keep < npages)
release_pages(pages + keep, npages - keep);
+	buf->nr = keep;
+	buf->footprint = keep;
+	if (ret > 0)
+		buf->size = ret;
+ kfree(pages);
- /* Push the remaining pages into the pipe. */
- remain = ret;
- for (i = 0; i < keep; i++) {
- struct pipe_buffer *buf = pipe_head_buf(pipe);
-
- chunk = min_t(size_t, remain, PAGE_SIZE);
- *buf = (struct pipe_buffer) {
- .ops = &default_pipe_buf_ops,
- .page = bv[i].bv_page,
- .offset = 0,
- .len = chunk,
- };
- pipe->head++;
- remain -= chunk;
- }
-
- kfree(bv);
+	/*
+	 * Push the remaining pages into the pipe (pipe_add() will discard
+	 * the buf if it's empty).
+	 */
+ pipe_add(pipe, buf, &full);
return ret;
}
EXPORT_SYMBOL(direct_splice_read);
@@ -418,9 +318,8 @@ static int pipe_to_sendmsg(struct pipe_inode_info *pipe, struct splice_desc *sd,
if (sd->flags & SPLICE_F_MORE)
msg.msg_flags |= MSG_MORE;
- if (sd->len < sd->total_len &&
- pipe_occupancy(pipe->head, pipe->tail) > 1)
- msg.msg_flags |= MSG_MORE;
+ if (sd->len < sd->total_len)
+ msg.msg_flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bv, nr_bv, sd->len);
return sock_sendmsg(sock, &msg);
@@ -436,44 +335,55 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
}
/*
- * Try to steal the page from a pipe buffer and if we fail, copy the page and
- * replace the pipe buffer with one that points to the copy.
+ * Try to steal the page from a pipe buffer and if we fail, copy the folio and
+ * update the pipe buffer to point to the copy.
*/
static int splice_steal_or_copy(struct pipe_inode_info *pipe,
struct pipe_buffer *buf,
struct splice_desc *sd)
{
- if (!pipe_buf_try_steal(pipe, buf)) {
- /* Fall back to replacing the buffer page with a copy. */
- struct page *page;
- size_t offset = buf->offset, len = buf->len;
- void *src, *dst;
+ struct bio_vec *bv = &buf->bvec[buf->index];
- page = alloc_page(GFP_KERNEL);
- if (!page)
+ if (!pipe_buf_try_steal(pipe, buf)) {
+ /* Fall back to replacing the buffer folio with a copy. */
+ struct folio *folio;
+ size_t offset_d = 0;
+ size_t offset_s = bv->bv_offset, len = bv->bv_len;
+		size_t order = get_order(len);
+
+ WARN_ON(order > folio_order(bv->bv_folio));
+
+ folio = folio_alloc(GFP_KERNEL, order);
+ if (!folio)
return -ENOMEM;
- src = kmap_local_page(buf->page);
- dst = kmap_local_page(page);
- memcpy(dst + offset, src + offset, len);
- kunmap_local(src);
- kunmap_local(dst);
+ do {
+ void *src, *dst;
+ size_t part = min3(len,
+ PAGE_SIZE - (offset_s & ~PAGE_MASK),
+ PAGE_SIZE - (offset_d & ~PAGE_MASK));
+
+ src = kmap_local_folio(bv->bv_folio, offset_s);
+ dst = kmap_local_folio(folio, offset_d);
+ memcpy(dst, src, part);
+ kunmap_local(src);
+ kunmap_local(dst);
+ offset_s += part;
+ offset_d += part;
+ len -= part;
+ } while (len > 0);
-		pipe_buf_release(pipe, buf);
+		/* Drop the pipe's reference to the original folio */
+		folio_put(bv->bv_folio);
- *buf = (struct pipe_buffer) {
- .page = page,
- .offset = offset,
- .len = len,
- .ops = &default_pipe_buf_ops,
- };
+ bv->bv_folio = folio;
+ bv->bv_offset = 0;
} else {
/* Need to unlock the page */
- unlock_page(buf->page);
- buf->ops = &default_pipe_buf_ops;
- buf->private = 0;
+ folio_unlock(bv->bv_folio);
}
+ buf->flags |= PIPE_BUF_FLAG_IX_STOLEN;
return 0;
}
@@ -500,18 +410,11 @@ static int splice_steal_or_copy(struct pipe_inode_info *pipe,
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_actor *actor)
{
- unsigned int head = pipe->head;
- unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
+ struct pipe_buffer *buf;
int ret;
- while (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
- struct bio_vec bv;
-
- sd->len = buf->len;
- if (sd->len > sd->total_len)
- sd->len = sd->total_len;
+ while ((buf = pipe_head_buf(pipe))) {
+ sd->len = min(buf->size, sd->total_len);
ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
@@ -526,23 +429,20 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
return ret;
}
- bvec_set_page(&bv, buf->page, buf->offset, buf->len);
- ret = actor(pipe, sd, 1, &bv);
+ ret = actor(pipe, sd, buf->nr - buf->index,
+ buf->bvec + buf->index);
if (ret <= 0)
return ret;
- buf->offset += ret;
- buf->len -= ret;
+		pipe_consume(pipe, buf, ret);
sd->num_spliced += ret;
sd->len -= ret;
sd->pos += ret;
sd->total_len -= ret;
- if (!buf->len) {
+ if (!buf->size) {
pipe_buf_release(pipe, buf);
- tail++;
- pipe->tail = tail;
if (pipe->files)
sd->need_wakeup = true;
}
@@ -557,13 +457,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
/* We know we have a pipe buffer, but maybe it's empty? */
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
- unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
- if (unlikely(!buf->len)) {
+ if (buf && unlikely(!buf->size)) {
pipe_buf_release(pipe, buf);
- pipe->tail = tail+1;
return true;
}
@@ -590,7 +487,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
return -ERESTARTSYS;
repeat:
- while (pipe_empty(pipe->head, pipe->tail)) {
+ while (pipe_empty(pipe)) {
if (!pipe->writers)
return 0;
@@ -736,67 +633,31 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
.pos = *ppos,
.u.file = out,
};
- int nbufs = pipe->max_usage;
- struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
- GFP_KERNEL);
ssize_t ret;
- if (unlikely(!array))
- return -ENOMEM;
-
pipe_lock(pipe);
-
splice_from_pipe_begin(&sd);
+
while (sd.total_len) {
+ struct pipe_buffer *buf;
struct iov_iter from;
- unsigned int head, tail, mask;
- size_t left;
- int n;
ret = splice_from_pipe_next(pipe, &sd);
if (ret <= 0)
break;
- if (unlikely(nbufs < pipe->max_usage)) {
- kfree(array);
- nbufs = pipe->max_usage;
- array = kcalloc(nbufs, sizeof(struct bio_vec),
- GFP_KERNEL);
- if (!array) {
- ret = -ENOMEM;
- break;
- }
+ buf = pipe_head_buf(pipe);
+
+ ret = pipe_buf_confirm(pipe, buf);
+ if (unlikely(ret)) {
+ if (ret == -ENODATA)
+ ret = 0;
+ break;
}
- head = pipe->head;
- tail = pipe->tail;
- mask = pipe->ring_size - 1;
-
- /* build the vector */
- left = sd.total_len;
- for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
- size_t this_len = buf->len;
-
- /* zero-length bvecs are not supported, skip them */
- if (!this_len)
- continue;
- this_len = min(this_len, left);
-
- ret = pipe_buf_confirm(pipe, buf);
- if (unlikely(ret)) {
- if (ret == -ENODATA)
- ret = 0;
- goto done;
- }
-
- bvec_set_page(&array[n], buf->page, this_len,
- buf->offset);
- left -= this_len;
- n++;
- }
-
- iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
+ iov_iter_bvec(&from, ITER_SOURCE,
+ buf->bvec + buf->index, buf->nr - buf->index,
+ min(sd.total_len, buf->size));
ret = vfs_iter_write(out, &from, &sd.pos, 0);
if (ret <= 0)
break;
@@ -805,35 +666,12 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
sd.total_len -= ret;
*ppos = sd.pos;
- /* dismiss the fully eaten buffers, adjust the partial one */
- tail = pipe->tail;
- while (ret) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
- if (ret >= buf->len) {
- ret -= buf->len;
- buf->len = 0;
- pipe_buf_release(pipe, buf);
- tail++;
- pipe->tail = tail;
- if (pipe->files)
- sd.need_wakeup = true;
- } else {
- buf->offset += ret;
- buf->len -= ret;
- ret = 0;
- }
- }
+ pipe_consume(pipe, buf, ret);
}
-done:
- kfree(array);
+
splice_from_pipe_end(pipe, &sd);
-
pipe_unlock(pipe);
-
- if (sd.num_spliced)
- ret = sd.num_spliced;
-
- return ret;
+ return sd.num_spliced ?: ret;
}
EXPORT_SYMBOL(iter_file_splice_write);
@@ -904,7 +742,7 @@ long vfs_splice_read(struct file *in, loff_t *ppos,
return -EBADF;
/* Don't try to read more the pipe has space for. */
- p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
+ p_space = pipe->max_footprint - pipe_occupancy(pipe);
len = min_t(size_t, len, p_space << PAGE_SHIFT);
ret = rw_verify_area(READ, in, ppos, len);
@@ -937,9 +775,10 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
splice_direct_actor *actor)
{
struct pipe_inode_info *pipe;
+ struct pipe_buffer *buf;
long ret, bytes;
size_t len;
- int i, flags, more;
+ int flags, more;
/*
* We require the input to be seekable, as we don't want to randomly
@@ -983,7 +822,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
sd->flags &= ~SPLICE_F_NONBLOCK;
more = sd->flags & SPLICE_F_MORE;
- WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
+ WARN_ON_ONCE(!pipe_empty(pipe));
while (len) {
size_t read_len;
@@ -1027,7 +866,6 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
}
done:
- pipe->tail = pipe->head = 0;
file_accessed(in);
return bytes;
@@ -1036,11 +874,9 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
* If we did an incomplete transfer we must release
* the pipe buffers in question:
*/
- for (i = 0; i < pipe->ring_size; i++) {
- struct pipe_buffer *buf = &pipe->bufs[i];
-
- if (buf->ops)
- pipe_buf_release(pipe, buf);
+	while ((buf = pipe_head_buf(pipe))) {
+		pipe_buf_release(pipe, buf);
}
if (!bytes)
@@ -1113,7 +949,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
send_sig(SIGPIPE, current, 0);
return -EPIPE;
}
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (!pipe_full(pipe))
return 0;
if (flags & SPLICE_F_NONBLOCK)
return -EAGAIN;
@@ -1275,22 +1111,19 @@ static long __do_splice(struct file *in, loff_t __user *off_in,
return ret;
}
-static int iter_to_pipe(struct iov_iter *from,
- struct pipe_inode_info *pipe,
- unsigned flags)
+static int iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe,
+ unsigned int flags)
{
- struct pipe_buffer buf = {
- .ops = &user_page_pipe_buf_ops,
- .flags = flags
- };
- size_t total = 0;
+ size_t spliced = 0;
+ bool full = false;
int ret = 0;
while (iov_iter_count(from)) {
+ struct pipe_buffer *buf;
struct page *pages[16];
ssize_t left;
size_t start;
- int i, n;
+ int i;
left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
if (left <= 0) {
@@ -1298,28 +1131,29 @@ static int iter_to_pipe(struct iov_iter *from,
break;
}
- n = DIV_ROUND_UP(left + start, PAGE_SIZE);
- for (i = 0; i < n; i++) {
- int size = min_t(int, left, PAGE_SIZE - start);
+ buf = pipe_alloc_buffer(pipe, &user_page_pipe_buf_ops,
+ DIV_ROUND_UP(left + start, PAGE_SIZE),
+ GFP_KERNEL, &ret);
+ if (!buf)
+ break;
+ buf->flags |= flags;
- buf.page = pages[i];
- buf.offset = start;
- buf.len = size;
- ret = add_to_pipe(pipe, &buf);
- if (unlikely(ret < 0)) {
- iov_iter_revert(from, left);
- // this one got dropped by add_to_pipe()
- while (++i < n)
- put_page(pages[i]);
- goto out;
- }
- total += ret;
+		for (i = 0; i < buf->max && left > 0; i++) {
+ size_t size = min_t(size_t, left, PAGE_SIZE - start);
+
+ bvec_set_page(&buf->bvec[i], pages[i], size, start);
+ buf->size += size;
left -= size;
start = 0;
}
+
+		buf->nr = i;
+		buf->footprint = i;
+ spliced += pipe_add(pipe, buf, &full);
+ if (full)
+ break;
}
-out:
- return total ? total : ret;
+
+ return spliced ?: ret;
}
static int pipe_to_user(struct pipe_inode_info *pipe, struct splice_desc *sd,
@@ -1491,13 +1325,13 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Check the pipe occupancy without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (!pipe_empty(pipe->head, pipe->tail))
+ if (!pipe_empty(pipe))
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe_empty(pipe->head, pipe->tail)) {
+ while (pipe_empty(pipe)) {
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
@@ -1527,13 +1361,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Check pipe occupancy without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (!pipe_full(pipe))
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ while (pipe_full(pipe)) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
@@ -1555,28 +1389,79 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
}
/*
+ * Split the front off of a buffer and paste it into another buffer.
+ */
+static void splice_split_buffer(struct pipe_buffer *ibuf,
+ struct pipe_buffer *obuf,
+ size_t len)
+{
+	unsigned int i = ibuf->index, o = 0;
+
+	obuf->ops = ibuf->ops;
+	obuf->private = ibuf->private;
+	obuf->private_2 = ibuf->private_2;
+	obuf->size = len;
+	obuf->footprint = 0;
+	obuf->nr_confirmed = ibuf->nr_confirmed - ibuf->index;
+
+	/*
+	 * Don't inherit the gift and merge flags, we need to prevent multiple
+	 * steals of this page.
+	 */
+	obuf->flags = ibuf->flags &
+		~(PIPE_BUF_FLAG_GIFT | PIPE_BUF_FLAG_CAN_MERGE);
+
+	ibuf->size -= len;
+
+	do {
+		size_t part = min_t(size_t, ibuf->bvec[i].bv_len, len);
+
+		obuf->bvec[o] = ibuf->bvec[i];
+		obuf->bvec[o].bv_len = part;
+		obuf->footprint += folio_nr_pages(obuf->bvec[o].bv_folio);
+
+		ibuf->bvec[i].bv_offset += part;
+		ibuf->bvec[i].bv_len -= part;
+		len -= part;
+		o++;
+		if (ibuf->bvec[i].bv_len)
+			break;
+		i++;
+		if (o >= obuf->max)
+			break;
+	} while (len > 0);
+
+	ibuf->index = i;
+	obuf->nr = o;
+
+#error need to do the page getting thing
+	obuf->ops->get_pages(obuf);
+}
+
+/*
* Splice contents of ipipe to opipe.
*/
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags)
{
- struct pipe_buffer *ibuf, *obuf;
- unsigned int i_head, o_head;
- unsigned int i_tail, o_tail;
- unsigned int i_mask, o_mask;
- int ret = 0;
- bool input_wakeup = false;
+ struct pipe_buffer *ibuf, *spare;
+ size_t spliced = 0;
+ int ret = -EAGAIN;
+ bool input_wakeup = false, full;
+ /* We may need to split a buffer */
+ spare = pipe_alloc_buffer(opipe, NULL, 16, GFP_KERNEL, &ret);
+ if (!spare)
+ return ret;
retry:
ret = ipipe_prep(ipipe, flags);
if (ret)
- return ret;
+ goto out;
ret = opipe_prep(opipe, flags);
if (ret)
- return ret;
+ goto out;
/*
* Potential ABBA deadlock, work around it by ordering lock
@@ -1585,41 +1470,29 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
*/
pipe_double_lock(ipipe, opipe);
- i_tail = ipipe->tail;
- i_mask = ipipe->ring_size - 1;
- o_head = opipe->head;
- o_mask = opipe->ring_size - 1;
-
+ full = pipe_full(opipe);
do {
- size_t o_len;
-
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
- if (!ret)
- ret = -EPIPE;
+ ret = -EPIPE;
break;
}
- i_head = ipipe->head;
- o_tail = opipe->tail;
-
- if (pipe_empty(i_head, i_tail) && !ipipe->writers)
+ if (pipe_empty(ipipe) && !ipipe->writers)
break;
/*
* Cannot make any progress, because either the input
* pipe is empty or the output pipe is full.
*/
- if (pipe_empty(i_head, i_tail) ||
- pipe_full(o_head, o_tail, opipe->max_usage)) {
+ if (pipe_empty(ipipe) || full) {
/* Already processed some buffers, break */
- if (ret)
+ if (spliced)
break;
- if (flags & SPLICE_F_NONBLOCK) {
- ret = -EAGAIN;
+ ret = -EAGAIN;
+ if (flags & SPLICE_F_NONBLOCK)
break;
- }
/*
* We raced with another reader/writer and haven't
@@ -1631,50 +1504,26 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
goto retry;
}
- ibuf = &ipipe->bufs[i_tail & i_mask];
- obuf = &opipe->bufs[o_head & o_mask];
+ ibuf = pipe_head_buf(ipipe);
+ if (ibuf->size <= len - spliced) {
+ /* Simply move the whole buffer from ipipe to opipe */
+ spin_lock_irq(&ipipe->rd_wait.lock);
+ ipipe->footprint -= ibuf->footprint;
+ list_del(&ibuf->queue_link);
+			spin_unlock_irq(&ipipe->rd_wait.lock);
+			input_wakeup = true;
- if (len >= ibuf->len) {
- /*
- * Simply move the whole buffer from ipipe to opipe
- */
- *obuf = *ibuf;
- ibuf->ops = NULL;
- i_tail++;
- ipipe->tail = i_tail;
- input_wakeup = true;
- o_len = obuf->len;
- o_head++;
- opipe->head = o_head;
+ spliced += pipe_add(opipe, ibuf, &full);
} else {
/*
- * Get a reference to this pipe buffer,
- * so we can copy the contents over.
+ * Need to split the pipe buffer. Multiple folios may
+ * be involved.
*/
- if (!pipe_buf_get(ipipe, ibuf)) {
- if (ret == 0)
- ret = -EFAULT;
- break;
- }
- *obuf = *ibuf;
+ splice_split_buffer(ibuf, spare, len - spliced);
- /*
- * Don't inherit the gift and merge flags, we need to
- * prevent multiple steals of this page.
- */
- obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
- obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
-
- obuf->len = len;
- ibuf->offset += len;
- ibuf->len -= len;
- o_len = len;
- o_head++;
- opipe->head = o_head;
+ spliced += pipe_add(opipe, spare, &full);
+ spare = NULL;
}
- ret += o_len;
- len -= o_len;
- } while (len);
+ } while (spliced < len);
pipe_unlock(ipipe);
pipe_unlock(opipe);
@@ -1682,13 +1531,16 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/*
* If we put data in the output pipe, wakeup any potential readers.
*/
- if (ret > 0)
+ if (spliced)
wakeup_pipe_readers(opipe);
if (input_wakeup)
wakeup_pipe_writers(ipipe);
- return ret;
+out:
+ if (spare)
+ pipe_buf_release(opipe, spare);
+ return spliced ?: ret;
}
/*
@@ -1731,8 +1583,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* If we have iterated all input buffers or run out of
* output room, break.
*/
- if (pipe_empty(i_head, i_tail) ||
- pipe_full(o_head, o_tail, opipe->max_usage))
+		if (pipe_empty(ipipe) || pipe_full(opipe))
break;
ibuf = &ipipe->bufs[i_tail & i_mask];
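For reference, the producer-side pattern that replaces splice_to_pipe() in the conversions above and below is roughly the following sketch. Only the pipe_alloc_buffer()/pipe_add() API introduced by this series is assumed; my_pipe_buf_ops, nr_segs, pages[], lens[] and offs[] are placeholders for whatever the caller has to hand:

	struct pipe_buffer *buf;
	ssize_t spliced;
	bool full = false;
	int i, error = 0;

	buf = pipe_alloc_buffer(pipe, &my_pipe_buf_ops, nr_segs,
				GFP_KERNEL, &error);
	if (!buf)
		return error;

	for (i = 0; i < nr_segs; i++) {
		/* One bvec per segment; account bytes and pinned pages */
		bvec_set_page(&buf->bvec[i], pages[i], lens[i], offs[i]);
		buf->size += lens[i];
		buf->footprint++;
	}
	buf->nr = nr_segs;

	spliced = pipe_add(pipe, buf, &full);	/* consumes buf */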
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 555aae54..6c030a2 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -18,6 +18,7 @@ struct page;
/**
* struct bio_vec - a contiguous range of physical memory addresses
+ * @bv_folio: First folio associated with the address range.
* @bv_page: First page associated with the address range.
* @bv_len: Number of bytes in the address range.
* @bv_offset: Start of the address range relative to the start of @bv_page.
@@ -29,7 +30,10 @@ struct page;
* This holds because page_is_mergeable() checks the above property.
*/
struct bio_vec {
- struct page *bv_page;
+ union {
+ struct folio *bv_folio;
+ struct page *bv_page;
+ };
unsigned int bv_len;
unsigned int bv_offset;
};
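Since bv_folio and bv_page share storage, a bvec filled from a folio must carry the folio pointer itself (equivalently, the folio's head page), with bv_offset giving the byte offset into the whole folio. A sketch of the intended fill pattern, with folio, pos and len supplied by the caller:

	struct bio_vec bv;

	bv.bv_folio	= folio;	/* aliases bv_page */
	bv.bv_offset	= offset_in_folio(folio, pos);
	bv.bv_len	= len;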
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 569483e..2e33c66d 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -12,25 +12,38 @@
#define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */
#define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */
#define PIPE_BUF_FLAG_WHOLE 0x20 /* read() must return entire buffer or error */
+#define PIPE_BUF_FLAG_IX_STOLEN 0x40 /* The folio at bvec[index] has been stolen/copied */
#ifdef CONFIG_WATCH_QUEUE
-#define PIPE_BUF_FLAG_LOSS		0x40 /* Message loss happened after this buffer */
+#define PIPE_BUF_FLAG_LOSS		0x80 /* Message loss happened after this buffer */
#endif
/**
* struct pipe_buffer - a linux kernel pipe buffer
- * @page: the page containing the data for the pipe buffer
- * @offset: offset of data inside the @page
- * @len: length of data inside the @page
+ * @queue_link: Link in the owning pipe's queue of buffers
* @ops: operations associated with this buffer. See @pipe_buf_operations.
* @flags: pipe buffer flags. See above.
* @private: private data owned by the ops.
+ * @private_2: Additional private data owned by the ops.
+ * @size: Amount of data held in the buffer in bytes
+ * @footprint: Amount of memory pinned by this buffer in pages
+ * @max: The size of bvec[]
+ * @index: Current element in bvec[] to consume.
+ * @nr: bvec[] count.
+ * @nr_confirmed: Number of elements of bvec[] confirmed so far.
+ * @bvec: List of buffer folios
**/
struct pipe_buffer {
- struct page *page;
- unsigned int offset, len;
const struct pipe_buf_operations *ops;
- unsigned int flags;
- unsigned long private;
+ struct list_head queue_link;
+ size_t size;
+ size_t footprint;
+ void *private;
+ unsigned long private_2;
+ unsigned int flags;
+ unsigned short index;
+ unsigned short max;
+ unsigned short nr;
+ unsigned short nr_confirmed;
+	struct bio_vec bvec[];
};
/*
@@ -46,10 +59,10 @@ struct pipe_buffer {
struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
- * and that the contents are good. If the pages in the pipe belong
+ * and that the contents are good. If the folios in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
- * error. If not present all pages are considered good.
+ * error. If not present all folios are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
@@ -62,17 +75,25 @@ struct pipe_buf_operations {
/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
- * of the pipe (the buf->page) is locked and now completely owned by the
- * caller. The page may then be transferred to a different mapping, the
- * most often used case is insertion into different file address space
- * cache.
+ * of the pipe (the bvec[]) is locked and now completely owned by the
+ * caller. The folios may then be transferred to a different mapping,
+ * the most often used case is insertion into different file address
+ * space cache.
*/
bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);
/*
* Get a reference to the pipe buffer.
*/
- bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
+ bool (*get_pages)(struct pipe_inode_info *pipe, struct pipe_buffer *buf);
+
+ /*
+ * Copy data out of the pipe buffer, performing any confirmatory step
+ * necessary beforehand and releasing any used up bufferage. The
+ * caller will dispose of the buffer when buf->size reduces to zero.
+ */
+ ssize_t (*copy_to_iter)(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf, struct iov_iter *iter);
};
/**
@@ -89,20 +110,6 @@ static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
}
/**
- * pipe_buf_release - put a reference to a pipe_buffer
- * @pipe: the pipe that the buffer belongs to
- * @buf: the buffer to put a reference to
- */
-static inline void pipe_buf_release(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- const struct pipe_buf_operations *ops = buf->ops;
-
- buf->ops = NULL;
- ops->release(pipe, buf);
-}
-
-/**
* pipe_buf_confirm - verify contents of the pipe buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to confirm
@@ -137,6 +144,7 @@ ssize_t pipe_add(struct pipe_inode_info *pipe, struct pipe_buffer *buf, bool *full);
#ifdef CONFIG_WATCH_QUEUE
void pipe_set_lost_mark(struct pipe_inode_info *pipe);
#endif
+void pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf);
/* Get data from a pipe */
size_t pipe_query_content(struct pipe_inode_info *pipe, size_t *len);
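As an illustration of the new ->copy_to_iter() hook, a minimal implementation for a plain page-backed buffer might look like the sketch below; my_copy_to_iter is hypothetical and not part of this patch:

	static ssize_t my_copy_to_iter(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf,
				       struct iov_iter *iter)
	{
		struct bio_vec *bv = &buf->bvec[buf->index];
		size_t n = min_t(size_t, bv->bv_len, iov_iter_count(iter));

		n = copy_page_to_iter(bv->bv_page, bv->bv_offset, n, iter);
		if (!n)
			return -EFAULT;
		bv->bv_offset += n;
		bv->bv_len -= n;
		buf->size -= n;
		if (!bv->bv_len)
			buf->index++;	/* This element is used up */
		return n;
	}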
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 9ed729e..9f6f5ef 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -45,24 +45,9 @@ struct splice_desc {
bool steal_or_copy; /* Want the pages pre-stealing or copying */
};
-struct partial_page {
- unsigned int offset;
- unsigned int len;
- unsigned long private;
-};
-
/*
* Passed to splice_to_pipe
*/
-struct splice_pipe_desc {
- struct page **pages; /* page map */
- struct partial_page *partial; /* pages[] may not be contig */
- int nr_pages; /* number of populated pages in map */
- unsigned int nr_pages_max; /* pages[] & partial[] arrays size */
- const struct pipe_buf_operations *ops;/* ops associated with output pipe */
- void (*spd_release)(struct splice_pipe_desc *, unsigned int);
-};
-
typedef int (splice_actor)(struct pipe_inode_info *pipe, struct splice_desc *sd,
unsigned int nr_bv, struct bio_vec *bv);
typedef int (splice_direct_actor)(struct pipe_inode_info *,
@@ -72,11 +57,7 @@ extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *,
loff_t *, size_t, unsigned int,
splice_actor);
extern ssize_t __splice_from_pipe(struct pipe_inode_info *,
- struct splice_desc *, splice_actor);
-extern ssize_t splice_to_pipe(struct pipe_inode_info *,
- struct splice_pipe_desc *);
-extern ssize_t add_to_pipe(struct pipe_inode_info *,
- struct pipe_buffer *);
+ struct splice_desc *, splice_actor *);
long vfs_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
@@ -94,8 +75,8 @@ extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
/*
* for dynamic pipe sizing
*/
-extern int splice_grow_spd(const struct pipe_inode_info *, struct splice_pipe_desc *);
-extern void splice_shrink_spd(struct splice_pipe_desc *);
+int splice_grow_buf(const struct pipe_inode_info *pipe, struct pipe_buffer *buf);
+void splice_shrink_buf(struct pipe_buffer *buf);
extern const struct pipe_buf_operations page_cache_pipe_buf_ops;
extern const struct pipe_buf_operations default_pipe_buf_ops;
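Under the revised splice_actor typedef, an actor receives the buffer's bvec array directly instead of a single page in the splice_desc. A hedged sketch, where write_out_segment() stands in for the actor's real output step:

	static int my_actor(struct pipe_inode_info *pipe,
			    struct splice_desc *sd,
			    unsigned int nr_bv, struct bio_vec *bv)
	{
		size_t copied = 0;
		unsigned int i;

		for (i = 0; i < nr_bv && sd->len > 0; i++) {
			size_t n = min_t(size_t, bv[i].bv_len, sd->len);

			/* Consume one segment of the buffer */
			write_out_segment(sd, &bv[i], n);
			copied += n;
			sd->len -= n;
		}
		return copied;
	}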
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index fc6bba2..cef9383 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -77,16 +77,16 @@ struct watch_list {
spinlock_t lock;
};
-extern void __post_watch_notification(struct watch_list *,
- struct watch_notification *,
- const struct cred *,
- u64);
+extern void __post_watch_notification(struct watch_list *wlist,
+ struct watch_notification *n,
+ const struct cred *cred,
+ gfp_t gfp,
+ u64 id);
extern struct watch_queue *get_watch_queue(int);
extern void put_watch_queue(struct watch_queue *);
extern void init_watch(struct watch *, struct watch_queue *);
extern int add_watch_to_object(struct watch *, struct watch_list *);
extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool);
-extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int);
extern long watch_queue_set_filter(struct pipe_inode_info *,
struct watch_notification_filter __user *);
extern int watch_queue_init(struct pipe_inode_info *);
@@ -103,10 +103,11 @@ static inline void init_watch_list(struct watch_list *wlist,
static inline void post_watch_notification(struct watch_list *wlist,
struct watch_notification *n,
const struct cred *cred,
+ gfp_t gfp,
u64 id)
{
if (unlikely(wlist))
- __post_watch_notification(wlist, n, cred, id);
+ __post_watch_notification(wlist, n, cred, gfp, id);
}
static inline void remove_watch_list(struct watch_list *wlist, u64 id)
diff --git a/kernel/relay.c b/kernel/relay.c
index 9aa70ae..5e48d03 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1086,9 +1086,12 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct rchan_buf *rbuf;
+ unsigned int size = buf->bvec[buf->index].bv_len;
- rbuf = (struct rchan_buf *)page_private(buf->page);
- relay_consume_bytes(rbuf, buf->private);
+ rbuf = buf->private;
+ if (buf->index == buf->nr - 1)
+ size += buf->private_2; /* Account for end padding */
+ relay_consume_bytes(rbuf, size);
}
static const struct pipe_buf_operations relay_pipe_buf_ops = {
@@ -1097,10 +1100,6 @@ static const struct pipe_buf_operations relay_pipe_buf_ops = {
-	.get = generic_pipe_buf_get,
+	.get_pages = generic_pipe_buf_get,
};
-static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-}
-
/*
* subbuf_splice_actor - splice up to one subbuf's worth of data
*/
@@ -1112,6 +1111,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
int *nonpad_ret)
{
unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
+ struct pipe_buffer *buf;
struct rchan_buf *rbuf = in->private_data;
unsigned int subbuf_size = rbuf->chan->subbuf_size;
uint64_t pos = (uint64_t) *ppos;
@@ -1120,22 +1120,12 @@ static ssize_t subbuf_splice_actor(struct file *in,
size_t read_subbuf = read_start / subbuf_size;
size_t padding = rbuf->padding[read_subbuf];
size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
- struct page *pages[PIPE_DEF_BUFFERS];
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct splice_pipe_desc spd = {
- .pages = pages,
- .nr_pages = 0,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .partial = partial,
- .ops = &relay_pipe_buf_ops,
- .spd_release = relay_page_release,
- };
- ssize_t ret;
+ ssize_t spliced = 0;
+ bool full = false;
+ int ret = 0;
if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
return 0;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
/*
* Adjust read len, if longer than what is available
@@ -1146,54 +1136,57 @@ static ssize_t subbuf_splice_actor(struct file *in,
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
+ nr_pages = min_t(unsigned int, subbuf_pages, PIPE_DEF_BUFFERS);
- for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
- unsigned int this_len, this_end, private;
+ buf = pipe_alloc_buffer(pipe, &relay_pipe_buf_ops, nr_pages,
+ GFP_KERNEL, &ret);
+ if (!buf)
+ return ret;
+
+ buf->private = rbuf;
+ /* buf->private_2 = 0; -- The amount of padding after the last segment */
+
+ for (total_len = 0; buf->nr < nr_pages;) {
+ struct folio *folio;
+ unsigned int this_len, this_end;
unsigned int cur_pos = read_start + total_len;
if (!len)
break;
this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
- private = this_len;
-
- spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
- spd.partial[spd.nr_pages].offset = poff;
this_end = cur_pos + this_len;
if (this_end >= nonpad_end) {
this_len = nonpad_end - cur_pos;
- private = this_len + padding;
+ buf->private_2 = padding;
}
- spd.partial[spd.nr_pages].len = this_len;
- spd.partial[spd.nr_pages].private = private;
+
+ folio = page_folio(rbuf->page_array[pidx]);
+
+		bvec_set_folio(&buf->bvec[buf->nr], folio, this_len, poff);
+		buf->size += this_len;
+ buf->footprint += folio_nr_pages(folio);
+ // TODO: Take page ref?
len -= this_len;
total_len += this_len;
poff = 0;
pidx = (pidx + 1) % subbuf_pages;
+ buf->nr++;
- if (this_end >= nonpad_end) {
- spd.nr_pages++;
+ if (this_end >= nonpad_end)
break;
- }
}
- ret = 0;
- if (!spd.nr_pages)
+ spliced = *nonpad_ret = pipe_add(pipe, buf, &full);
+ if (spliced < total_len)
goto out;
- ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
- if (ret < 0 || ret < total_len)
- goto out;
-
- if (read_start + ret == nonpad_end)
- ret += padding;
+ if (read_start + spliced == nonpad_end)
+ spliced += padding;
out:
- splice_shrink_spd(&spd);
- return ret;
+ return spliced;
}
static ssize_t relay_file_splice_read(struct file *in,
@@ -1228,10 +1221,7 @@ static ssize_t relay_file_splice_read(struct file *in,
nonpad_ret = 0;
}
- if (spliced)
- return spliced;
-
- return ret;
+ return spliced ?: ret;
}
const struct file_operations relay_file_operations = {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45551c7..3833744 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6901,12 +6901,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
return sret;
}
-static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
- unsigned int idx)
-{
- __free_page(spd->pages[idx]);
-}
-
static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
@@ -6960,23 +6954,17 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
size_t len,
unsigned int flags)
{
- struct page *pages_def[PIPE_DEF_BUFFERS];
- struct partial_page partial_def[PIPE_DEF_BUFFERS];
+ struct pipe_buffer *buf;
struct trace_iterator *iter = filp->private_data;
- struct splice_pipe_desc spd = {
- .pages = pages_def,
- .partial = partial_def,
- .nr_pages = 0, /* This gets updated below. */
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .ops = &default_pipe_buf_ops,
- .spd_release = tracing_spd_release_pipe,
- };
ssize_t ret;
size_t rem;
+ bool full = false;
unsigned int i;
- if (splice_grow_spd(pipe, &spd))
+ buf = kzalloc(struct_size(buf, bvec, PIPE_DEF_BUFFERS), GFP_KERNEL);
+ if (!buf)
return -ENOMEM;
+ buf->ops = &default_pipe_buf_ops;
mutex_lock(&iter->mutex);
@@ -7000,42 +6988,41 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
trace_access_lock(iter->cpu_file);
/* Fill as many pages as possible. */
- for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
- spd.pages[i] = alloc_page(GFP_KERNEL);
- if (!spd.pages[i])
+ for (i = 0, rem = len; i < PIPE_DEF_BUFFERS && rem; i++) {
+ struct folio *folio;
+ void *p;
+
+ folio = folio_alloc(GFP_KERNEL, 0);
+ if (!folio)
break;
rem = tracing_fill_pipe_page(rem, iter);
/* Copy the data into the page, so we can start over. */
- ret = trace_seq_to_buffer(&iter->seq,
- page_address(spd.pages[i]),
+ p = kmap_local_folio(folio, 0);
+ ret = trace_seq_to_buffer(&iter->seq, p,
trace_seq_used(&iter->seq));
+ kunmap_local(p);
if (ret < 0) {
- __free_page(spd.pages[i]);
+ folio_put(folio);
break;
}
- spd.partial[i].offset = 0;
- spd.partial[i].len = trace_seq_used(&iter->seq);
+		bvec_set_folio(&buf->bvec[i], folio, trace_seq_used(&iter->seq), 0);
+		buf->size += trace_seq_used(&iter->seq);
+		buf->footprint++;
trace_seq_init(&iter->seq);
}
+ buf->nr = i;
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
mutex_unlock(&iter->mutex);
- spd.nr_pages = i;
-
- if (i)
- ret = splice_to_pipe(pipe, &spd);
- else
- ret = 0;
+ ret = pipe_add(pipe, buf, &full);
out:
- splice_shrink_spd(&spd);
return ret;
out_err:
+ pipe_add(pipe, buf, &full);
mutex_unlock(&iter->mutex);
goto out;
}
@@ -8297,19 +8284,6 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
-	.get = buffer_pipe_buf_get,
+	.get_pages = buffer_pipe_buf_get,
};
-/*
- * Callback from splice_to_pipe(), if we need to release some pages
- * at the end of the spd in case we error'ed out in filling the pipe.
- */
-static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
-{
- struct buffer_ref *ref =
- (struct buffer_ref *)spd->partial[i].private;
-
- buffer_ref_release(ref);
- spd->partial[i].private = 0;
-}
-
static ssize_t
tracing_buffers_splice_read(struct file *file, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
@@ -8317,23 +8291,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
- struct partial_page partial_def[PIPE_DEF_BUFFERS];
- struct page *pages_def[PIPE_DEF_BUFFERS];
- struct splice_pipe_desc spd = {
- .pages = pages_def,
- .partial = partial_def,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .ops = &buffer_pipe_buf_ops,
- .spd_release = buffer_spd_release,
- };
- struct buffer_ref *ref;
- int entries, i;
- ssize_t ret = 0;
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->tr->current_trace->use_max_tr)
- return -EBUSY;
-#endif
+ struct pipe_buffer *buf;
+ struct buffer_ref **refs;
+ ssize_t spliced;
+ bool full = false;
+ int ret = 0, entries;
if (*ppos & (PAGE_SIZE - 1))
return -EINVAL;
@@ -8344,15 +8306,31 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
- if (splice_grow_spd(pipe, &spd))
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ return -EBUSY;
+#endif
+
+ buf = pipe_alloc_buffer(pipe, &buffer_pipe_buf_ops, PIPE_DEF_BUFFERS,
+ GFP_KERNEL, &ret);
+ if (!buf)
+ return ret;
+
+	refs = kcalloc(PIPE_DEF_BUFFERS, sizeof(*refs), GFP_KERNEL);
+ if (!refs) {
+ pipe_add(pipe, buf, &full);
return -ENOMEM;
+ }
+
+ buf->private_2 = (unsigned long)refs;
again:
trace_access_lock(iter->cpu_file);
- entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
- for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
- struct page *page;
+ while (len && buf->nr < buf->max &&
+ (entries = ring_buffer_entries_cpu(iter->array_buffer->buffer,
+ iter->cpu_file))) {
+ struct buffer_ref *ref;
int r;
ref = kzalloc(sizeof(*ref), GFP_KERNEL);
@@ -8381,28 +8359,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- page = virt_to_page(ref->page);
-
- spd.pages[i] = page;
- spd.partial[i].len = PAGE_SIZE;
- spd.partial[i].offset = 0;
- spd.partial[i].private = (unsigned long)ref;
- spd.nr_pages++;
+ bvec_set_page(&buf->bvec[buf->nr], ref->page, PAGE_SIZE, 0);
+ refs[buf->nr] = ref; // TODO: Use page->private?
+ buf->nr++;
+ buf->size += PAGE_SIZE;
*ppos += PAGE_SIZE;
-
- entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
+ len -= PAGE_SIZE;
}
trace_access_unlock(iter->cpu_file);
- spd.nr_pages = i;
/* did we read anything? */
- if (!spd.nr_pages) {
+ if (!ret && !buf->nr) {
long wait_index;
- if (ret)
- goto out;
-
ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
@@ -8425,11 +8395,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
goto again;
}
- ret = splice_to_pipe(pipe, &spd);
out:
- splice_shrink_spd(&spd);
-
- return ret;
+ spliced = pipe_add(pipe, buf, &full);
+ return spliced ?: ret;
}
/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
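A convention the error handling above leans on, and which later conversions in this patch rely on too: pipe_add() always consumes the pipe_buffer it is given, even one that is empty or only partially filled, so cleanup paths push a dead buffer back into the pipe rather than freeing it directly. Roughly, with my_ops and setup_failed as placeholders:

	buf = pipe_alloc_buffer(pipe, &my_ops, nr, GFP_KERNEL, &ret);
	if (!buf)
		return ret;

	if (setup_failed) {
		/* pipe_add() disposes of buf, filled or not */
		pipe_add(pipe, buf, &full);
		return -ENOMEM;
	}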
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index df89779..0541e3e 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -26,6 +26,7 @@
#include <linux/sched/signal.h>
#include <linux/watch_queue.h>
#include <linux/pipe_fs_i.h>
+#include <linux/uio.h>
#include "../fs/pipe.h"
#include "../fs/internal.h"
@@ -33,9 +34,6 @@ MODULE_DESCRIPTION("Watch queue");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
-#define WATCH_QUEUE_NOTE_SIZE 128
-#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
-
/*
* This must be called under the RCU read-lock, which makes
* sure that the wqueue still exists. It can then take the lock,
@@ -57,37 +55,45 @@ static inline void unlock_wqueue(struct watch_queue *wqueue)
spin_unlock_bh(&wqueue->lock);
}
+static ssize_t watchqueue_copy_buf_to_iter(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf,
+ struct iov_iter *iter)
+{
+ const struct watch_notification *n = buf->private;
+ struct watch_notification hdr;
+ size_t size = buf->size, body = size - sizeof(hdr);
+ __u32 id = buf->private_2 & 0xff;
+
+ /* Substitute the ID at the point of copying so the notification buffer
+ * can be shared
+ */
+ hdr = *n;
+ hdr.info &= ~WATCH_INFO_ID;
+ hdr.info |= id << WATCH_INFO_ID__SHIFT;
+
+ if (size > iov_iter_count(iter))
+ return -ENOBUFS; /* All or nothing */
+ if (copy_to_iter(&hdr, sizeof(hdr), iter) != sizeof(hdr))
+ return -EFAULT;
+ if (size > sizeof(hdr) &&
+ copy_to_iter(buf->private + sizeof(hdr), body, iter) != body)
+ return -EFAULT;
+	kfree(buf->private);
+	buf->private = NULL;	/* ->release() would otherwise free it again */
+	buf->size = 0;
+ return size;
+}
+
static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct watch_queue *wqueue = (struct watch_queue *)buf->private;
- struct page *page;
- unsigned int bit;
-
- /* We need to work out which note within the page this refers to, but
- * the note might have been maximum size, so merely ANDing the offset
- * off doesn't work. OTOH, the note must've been more than zero size.
- */
- bit = buf->offset + buf->len;
- if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0)
- bit -= WATCH_QUEUE_NOTE_SIZE;
- bit /= WATCH_QUEUE_NOTE_SIZE;
-
- page = buf->page;
- bit += page->index;
-
- set_bit(bit, wqueue->notes_bitmap);
- generic_pipe_buf_release(pipe, buf);
+ kfree(buf->private);
}
-// No try_steal function => no stealing
-#define watch_queue_pipe_buf_try_steal NULL
-
/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
.release = watch_queue_pipe_buf_release,
- .try_steal = watch_queue_pipe_buf_try_steal,
-	.get = generic_pipe_buf_get,
+	.get_pages = generic_pipe_buf_get,
+ .copy_to_iter = watchqueue_copy_buf_to_iter,
};
/*
@@ -97,62 +103,52 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
* watch_queue lock held, which guarantees that the pipe
* hasn't been released.
*/
-static bool post_one_notification(struct watch_queue *wqueue,
- struct watch_notification *n)
+static void post_one_notification(struct watch_queue *wqueue,
+ struct watch_notification *n,
+ unsigned int id)
{
- void *p;
+ struct watch_notification *buf2;
struct pipe_inode_info *pipe = wqueue->pipe;
struct pipe_buffer *buf;
- struct page *page;
- unsigned int note, offset, len;
- bool done = false, full = false;
+ unsigned int len = n->info & WATCH_INFO_LENGTH;
+ bool wake = false, full = false;
int error = 0;
if (!pipe)
- return false;
-
- spin_lock_irq(&pipe->rd_wait.lock);
+ return;
buf = pipe_alloc_buffer(pipe, &watch_queue_pipe_buf_ops, 1, GFP_ATOMIC,
&error);
- if (IS_ERR_OR_NULL(buf))
+ if (!buf)
+ goto lost;
+	buf2 = kmemdup(n, len, GFP_ATOMIC);
+ if (!buf2)
goto lost;
- note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
- if (note >= wqueue->nr_notes)
- goto lost;
+ buf->flags = PIPE_BUF_FLAG_WHOLE;
+ buf->private = buf2;
+ buf->private_2 = id;
+ buf->size = len;
+	buf->footprint += DIV_ROUND_UP(len, PAGE_SIZE);
- page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
- offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
- get_page(page);
- len = n->info & WATCH_INFO_LENGTH;
- p = kmap_atomic(page);
- memcpy(p + offset, n, len);
- kunmap_atomic(p);
-
- buf->page = page;
- buf->private = (unsigned long)wqueue;
- buf->offset = offset;
- buf->len = len;
- buf->flags = PIPE_BUF_FLAG_WHOLE;
pipe_add(pipe, buf, &full);
-
- if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
- spin_unlock_irq(&pipe->rd_wait.lock);
- BUG();
- }
- wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
- done = true;
-
-out:
+	wake = true;
-	spin_unlock_irq(&pipe->rd_wait.lock);
-	if (done)
+
+wake:
+	if (wake) {
+		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- return done;
+ }
+ return;
lost:
pipe_set_lost_mark(pipe);
- goto out;
+ if (buf)
+ pipe_buf_release(pipe, buf);
+ wake = true;
+ goto wake;
}
/*
@@ -186,6 +182,7 @@ static bool filter_watch_notification(const struct watch_filter *wf,
* @wlist: The watch list to post the event to.
* @n: The notification record to post.
* @cred: The creds of the process that triggered the notification.
+ * @gfp: Allocation flags for notification and pipe buf.
* @id: The ID to match on the watch.
*
* Post a notification of an event into a set of watch queues and let the users
@@ -197,6 +194,7 @@ static bool filter_watch_notification(const struct watch_filter *wf,
void __post_watch_notification(struct watch_list *wlist,
struct watch_notification *n,
const struct cred *cred,
+ gfp_t gfp,
u64 id)
{
const struct watch_filter *wf;
@@ -225,7 +223,7 @@ void __post_watch_notification(struct watch_list *wlist,
continue;
if (lock_wqueue(wqueue)) {
- post_one_notification(wqueue, n);
+ post_one_notification(wqueue, n, watch->info_id);
unlock_wqueue(wqueue);
}
}
@@ -235,75 +233,6 @@ void __post_watch_notification(struct watch_list *wlist,
EXPORT_SYMBOL(__post_watch_notification);
/*
- * Allocate sufficient pages to preallocation for the requested number of
- * notifications.
- */
-long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
-{
- struct watch_queue *wqueue = pipe->watch_queue;
- struct page **pages;
- unsigned long *bitmap;
- unsigned long user_bufs;
- int ret, i, nr_pages;
-
- if (!wqueue)
- return -ENODEV;
- if (wqueue->notes)
- return -EBUSY;
-
- if (nr_notes < 1 ||
- nr_notes > 512) /* TODO: choose a better hard limit */
- return -EINVAL;
-
- nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1);
- nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE;
- user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages);
-
- if (nr_pages > pipe->max_usage &&
- (too_many_pipe_buffers_hard(user_bufs) ||
- too_many_pipe_buffers_soft(user_bufs)) &&
- pipe_is_unprivileged_user()) {
- ret = -EPERM;
- goto error;
- }
-
- nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
- ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes));
- if (ret < 0)
- goto error;
-
- pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
- if (!pages)
- goto error;
-
- for (i = 0; i < nr_pages; i++) {
- pages[i] = alloc_page(GFP_KERNEL);
- if (!pages[i])
- goto error_p;
- pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
- }
-
- bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);
- if (!bitmap)
- goto error_p;
-
- bitmap_fill(bitmap, nr_notes);
- wqueue->notes = pages;
- wqueue->notes_bitmap = bitmap;
- wqueue->nr_pages = nr_pages;
- wqueue->nr_notes = nr_notes;
- return 0;
-
-error_p:
- while (--i >= 0)
- __free_page(pages[i]);
- kfree(pages);
-error:
- (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted);
- return ret;
-}
-
-/*
* Set the filter on a watch queue.
*/
long watch_queue_set_filter(struct pipe_inode_info *pipe,
@@ -560,7 +489,7 @@ int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
wqueue = rcu_dereference(watch->queue);
if (lock_wqueue(wqueue)) {
- post_one_notification(wqueue, &n.watch);
+ post_one_notification(wqueue, &n.watch, watch->info_id);
if (!hlist_unhashed(&watch->queue_node)) {
hlist_del_init_rcu(&watch->queue_node);
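With the extra gfp parameter, callers of post_watch_notification() now state their allocation context explicitly, since post_one_notification() has to allocate the note copy and the pipe buffer. A sketch of a process-context caller, with wlist and id supplied by the caller:

	struct watch_notification n = {
		.type		= WATCH_TYPE_META,
		.subtype	= WATCH_META_REMOVAL_NOTIFICATION,
		.info		= watch_sizeof(n),
	};

	post_watch_notification(wlist, &n, current_cred(), GFP_KERNEL, id);

Callers in atomic context would pass GFP_ATOMIC instead.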
diff --git a/mm/filemap.c b/mm/filemap.c
index b75a4b7..8f34ab4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2841,39 +2841,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
EXPORT_SYMBOL(generic_file_read_iter);
/*
- * Splice subpages from a folio into a pipe.
- */
-size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
- struct folio *folio, loff_t fpos, size_t size,
- bool *full, int *error)
-{
- struct pipe_buffer *buf;
- struct page *page;
- size_t spliced = 0, offset = offset_in_folio(folio, fpos);
-
- page = folio_page(folio, offset / PAGE_SIZE);
- size = min(size, folio_size(folio) - offset);
- offset %= PAGE_SIZE;
-
- while (spliced < size &&
- (buf = pipe_alloc_buffer(pipe, &page_cache_pipe_buf_ops, 1,
- GFP_KERNEL, error))) {
- size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
-
- buf->page = page++;
- buf->offset = offset;
- buf->len = part;
- folio_get(folio);
- spliced += pipe_add(pipe, buf, full);
- offset = 0;
- if (*full)
- break;
- }
-
- return spliced;
-}
-
-/*
* Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
* a pipe.
*/
@@ -2881,9 +2848,10 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
{
+ struct pipe_buffer *buf = NULL;
struct folio_batch fbatch;
struct kiocb iocb;
- size_t total_spliced = 0;
+ size_t total_spliced = 0, max_pages;
loff_t isize, end_offset;
bool writably_mapped, full = false;
int i, error = 0;
@@ -2892,11 +2860,17 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
iocb.ki_pos = *ppos;
/* Work out how much data we can actually add into the pipe */
- if (!pipe_query_space(pipe, &len, &error))
+ max_pages = pipe_query_space(pipe, &len, &error);
+ if (!max_pages)
return error;
folio_batch_init(&fbatch);
+ buf = pipe_alloc_buffer(pipe, &page_cache_pipe_buf_ops, max_pages,
+ GFP_KERNEL, &error);
+ if (!buf)
+ goto out;
+
do {
cond_resched();
@@ -2929,10 +2903,11 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- size_t n;
+ struct bio_vec *bv = buf->bvec;
+ size_t n, o;
if (folio_pos(folio) >= end_offset)
- goto out;
+ break;
folio_mark_accessed(folio);
/*
@@ -2943,22 +2918,32 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
if (writably_mapped)
flush_dcache_folio(folio);
+ o = offset_in_folio(folio, *ppos);
n = min_t(loff_t, len, isize - *ppos);
- n = splice_folio_into_pipe(pipe, folio, *ppos, n,
- &full, &error);
- if (!n)
- goto out;
+ n = min_t(size_t, n, folio_size(folio) - o);
+
+			bv[buf->nr].bv_folio = folio;
+			bv[buf->nr].bv_offset = o;
+			bv[buf->nr].bv_len = n;
+			buf->nr++;
+			buf->size += n;
+			buf->footprint += folio_nr_pages(folio);
+
+			folio_get(folio);
+
len -= n;
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (full)
- goto out;
+
+ if (buf->footprint >= max_pages)
+ break;
}
folio_batch_release(&fbatch);
-	} while (len);
+	} while (len && buf->footprint < max_pages);
+ pipe_add(pipe, buf, &full);
out:
folio_batch_release(&fbatch);
file_accessed(in);
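The loop above sizes its one pipe_buffer from pipe_query_space() and then uses buf->footprint, counted in pages, as the fill limit. The consumer-visible contract assumed here is roughly:

	size_t max_pages;
	int error = 0;

	/* May trim len to the space available; 0 means no room or error */
	max_pages = pipe_query_space(pipe, &len, &error);
	if (!max_pages)
		return error;

	/* ... fill at most max_pages' worth of folios into one buffer,
	 * accumulating buf->footprint, then pipe_add() the lot ... */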
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9fa333e..bc736cb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2744,15 +2744,6 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
}
EXPORT_SYMBOL(skb_copy_bits);
-/*
- * Callback from splice_to_pipe(), if we need to release some pages
- * at the end of the spd in case we error'ed out in filling the pipe.
- */
-static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
-{
- put_page(spd->pages[i]);
-}
-
static struct page *linear_to_page(struct page *page, unsigned int *len,
unsigned int *offset,
struct sock *sk)
@@ -2772,26 +2763,27 @@ static struct page *linear_to_page(struct page *page, unsigned int *len,
return pfrag->page;
}
-static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+static bool spd_can_coalesce(const struct pipe_buffer *buf,
struct page *page,
unsigned int offset)
{
- return spd->nr_pages &&
- spd->pages[spd->nr_pages - 1] == page &&
- (spd->partial[spd->nr_pages - 1].offset +
- spd->partial[spd->nr_pages - 1].len == offset);
+ const struct bio_vec *p = &buf->bvec[buf->nr - 1];
+
+ return buf->nr &&
+ p->bv_page == page &&
+ p->bv_offset + p->bv_len == offset;
}
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
-static bool spd_fill_page(struct splice_pipe_desc *spd,
+static bool spd_fill_page(struct pipe_buffer *buf,
struct pipe_inode_info *pipe, struct page *page,
unsigned int *len, unsigned int offset,
bool linear,
struct sock *sk)
{
- if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+ if (unlikely(buf->nr == MAX_SKB_FRAGS))
return true;
if (linear) {
@@ -2799,23 +2791,22 @@ static bool spd_fill_page(struct splice_pipe_desc *spd,
if (!page)
return true;
}
- if (spd_can_coalesce(spd, page, offset)) {
- spd->partial[spd->nr_pages - 1].len += *len;
+ if (spd_can_coalesce(buf, page, offset)) {
+		buf->bvec[buf->nr - 1].bv_len += *len;
+		buf->size += *len;
return false;
}
get_page(page);
- spd->pages[spd->nr_pages] = page;
- spd->partial[spd->nr_pages].len = *len;
- spd->partial[spd->nr_pages].offset = offset;
- spd->nr_pages++;
-
+ buf->bvec[buf->nr].bv_page = page;
+ buf->bvec[buf->nr].bv_len = *len;
+ buf->bvec[buf->nr].bv_offset = offset;
+	buf->nr++;
+	buf->size += *len;
+	buf->footprint++;
return false;
}
static bool __splice_segment(struct page *page, unsigned int poff,
unsigned int plen, unsigned int *off,
unsigned int *len,
- struct splice_pipe_desc *spd, bool linear,
+ struct pipe_buffer *buf, bool linear,
struct sock *sk,
struct pipe_inode_info *pipe)
{
@@ -2836,8 +2827,7 @@ static bool __splice_segment(struct page *page, unsigned int poff,
do {
unsigned int flen = min(*len, plen);
- if (spd_fill_page(spd, pipe, page, &flen, poff,
- linear, sk))
+ if (spd_fill_page(buf, pipe, page, &flen, poff, linear, sk))
return true;
poff += flen;
plen -= flen;
@@ -2853,7 +2843,7 @@ static bool __splice_segment(struct page *page, unsigned int poff,
*/
static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
unsigned int *offset, unsigned int *len,
- struct splice_pipe_desc *spd, struct sock *sk)
+ struct pipe_buffer *buf, struct sock *sk)
{
int seg;
struct sk_buff *iter;
@@ -2866,7 +2856,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
if (__splice_segment(virt_to_page(skb->data),
(unsigned long) skb->data & (PAGE_SIZE - 1),
skb_headlen(skb),
- offset, len, spd,
+ offset, len, buf,
skb_head_is_locked(skb),
sk, pipe))
return true;
@@ -2879,7 +2869,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
if (__splice_segment(skb_frag_page(f),
skb_frag_off(f), skb_frag_size(f),
- offset, len, spd, false, sk, pipe))
+ offset, len, buf, false, sk, pipe))
return true;
}
@@ -2892,7 +2882,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
* left, so no point in going over the frag_list for the error
* case.
*/
- if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
+ if (__skb_splice_bits(iter, pipe, offset, len, buf, sk))
return true;
}
@@ -2907,23 +2897,17 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
unsigned int flags)
{
- struct partial_page partial[MAX_SKB_FRAGS];
- struct page *pages[MAX_SKB_FRAGS];
- struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
- .nr_pages_max = MAX_SKB_FRAGS,
- .ops = &nosteal_pipe_buf_ops,
- .spd_release = sock_spd_release,
- };
+ struct pipe_buffer *buf;
+ bool full = false;
int ret = 0;
- __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
+ buf = pipe_alloc_buffer(pipe, &nosteal_pipe_buf_ops, MAX_SKB_FRAGS,
+ GFP_KERNEL, &ret);
+ if (!buf)
+ return ret;
- if (spd.nr_pages)
- ret = splice_to_pipe(pipe, &spd);
-
- return ret;
+ __skb_splice_bits(skb, pipe, &offset, &tlen, buf, sk);
+ return pipe_add(pipe, buf, &full);
}
EXPORT_SYMBOL_GPL(skb_splice_bits);
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 3c1e712..990edf7 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -195,7 +195,7 @@ static inline void notify_key(struct key *key,
};
post_watch_notification(key->watchers, &n.watch, current_cred(),
- n.key_id);
+				GFP_KERNEL, n.key_id);
#endif
}