How efficient can cat(1) be?
There have been a few initiatives in recent years to implement a new userspace base system for Linux distributions as an alternative to the GNU coreutils and BusyBox. Recently, one of the authors of one of these proposed implementations made the pitch in a few IRC channels that her cat implementation, which was derived from OpenBSD’s implementation, was the most efficient. But is it actually?
Understanding what cat actually does
At the most basic level, cat takes one or more files and
dumps them to stdout.  But do we need to actually use stdio
for this?  Actually, we don’t, and most competent cat
implementations at least use read(2) and write(2) if not
more advanced approaches.
If we consider cat as a form of buffer copy between an
arbitrary file descriptor and STDOUT_FILENO, we can understand
what the most efficient strategy to use for cat would be: splicing.
Anything which isn’t doing splicing, after all, involves unnecessary
buffer copies, and thus cannot be the most efficient.
To get the best performance out of spliced I/O, we have to have some prerequisites:
- 
The source and destination file descriptors should be unbuffered. 
- 
Any intermediate buffer should be a multiple of the filesystem block size. In general, to avoid doing a statsyscall, we can assume that a multiple ofPAGE_SIZEis likely acceptable.
A simple cat implementation
The simplest way to implement cat is the way that it is done in
BSD: using read and write on an intermediate buffer.  This
results in two buffer copies, but has the best portability.  Most
implementations of cat work this way, as it generally offers
good enough performance.
/* This program is released into the public domain. */
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>
void dumpfile(const char *path)
{
	int srcfd = STDIN_FILENO;
	char buf[PAGE_SIZE * 16];
	ssize_t nread, nwritten;
	size_t offset;
	/* POSIX allows - to represent stdin. */
	if (*path != '-')
	{
		srcfd = open(path, O_RDONLY);
		if (srcfd < 0)
			err(EXIT_FAILURE, "open %s", path);
	}
	while ((nread = read(srcfd, buf, sizeof buf)) >= 1)
	{
		for (offset = 0; nread > 0; nread -= nwritten, offset += nwritten)
		{
			if ((nwritten = write(STDOUT_FILENO, buf + offset, nread)) <= 0)
				err(EXIT_FAILURE, "write stdout");
		}
	}
	if (srcfd != STDIN_FILENO)
		(void) close(srcfd);
}
int main(int argc, const char *argv[])
{
	int i;
	for (i = 1; i < argc; i++)
		dumpfile(argv[i]);
	return EXIT_SUCCESS;
}
Implementing spliced I/O
Linux has no shortage of ways to perform spliced I/O.  For our cat
implementation, we have two possible ways to do it.
The first possible option is the venerable sendfile syscall, which
was originally added to improve the file serving performance of web
servers. Originally, sendfile required the destination
file descriptor to be a socket, but this restriction was removed in
Linux 2.6.33.  Unfortunately, sendfile is not perfect: because it
only supports file descriptors which can be memory mapped, we
must use a different strategy when using copying from stdin.
/* This program is released into the public domain. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>
bool spliced_copy(int srcfd)
{
	ssize_t nwritten;
	off_t offset = 0;
	do
	{
		nwritten = sendfile(STDOUT_FILENO, srcfd, &offset,
				    PAGE_SIZE * 16);
		if (nwritten < 0)
			return false;
	} while (nwritten > 0);
	return true;
}
void copy(int srcfd)
{
	char buf[PAGE_SIZE * 16];
	size_t nread, nwritten, offset;
	while ((nread = read(srcfd, buf, sizeof buf)) >= 1)
	{
		for (offset = 0; nread > 0;
		     nread -= nwritten, offset += nwritten)
		{
			if ((nwritten = write(STDOUT_FILENO,
					      buf + offset, nread)) <= 0)
				err(EXIT_FAILURE, "write stdout");
		}
	}
}
void dumpfile(const char *path)
{
	int srcfd = STDIN_FILENO;
	char buf[PAGE_SIZE * 16];
	/* POSIX allows - to represent stdin. */
	if (*path != '-')
	{
		srcfd = open(path, O_RDONLY);
		if (srcfd < 0)
			err(EXIT_FAILURE, "open %s", path);
	}
	/* Fall back to traditional copy if the spliced version fails. */
	if (!spliced_copy(srcfd))
		copy(srcfd);
	if (srcfd != STDIN_FILENO)
		(void) close(srcfd);
}
int main(int argc, const char *argv[])
{
	int i;
	int stdout_flags;
	stdout_flags = fcntl(STDOUT_FILENO, F_GETFL);
	if (stdout_flags < 0)
		err(EXIT_FAILURE, "fcntl(STDOUT_FILENO, F_GETFL)");
	stdout_flags &= ~O_APPEND;
	if (fcntl(STDOUT_FILENO, F_SETFL, stdout_flags) < 0)
		err(EXIT_FAILURE, "fcntl(STDOUT_FILENO, F_SETFL)");
	for (i = 1; i < argc; i++)
		dumpfile(argv[i]);
	return EXIT_SUCCESS;
}
Another approach is to use splice and a pipe.  This allows for true
zero-copy I/O in userspace, as a pipe is simply implemented as a 64KB
ring buffer in the kernel.  In this case, we just use two splice
operations per block of data we want to copy: one to move the data to
the pipe and another to move the data from the pipe to the output file.
/* This program is released into the public domain. */
#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>
#define BLOCK_SIZE ((PAGE_SIZE * 16) - 1)
bool spliced_copy(int srcfd)
{
	int pipefd[2];
	ssize_t nread, nwritten;
	off_t in_offset = 0;
	bool ret = true;
	if (pipe(pipefd) < 0)
		err(EXIT_FAILURE, "pipe");
	do
	{
		nread = splice(srcfd, &in_offset, pipefd[1], NULL,
			       BLOCK_SIZE, SPLICE_F_MOVE | SPLICE_F_MORE);
		if (nread <= 0)
		{
			ret = nread < 0 ? false : true;
			goto out;
		}
		nwritten = splice(pipefd[0], NULL, STDOUT_FILENO, NULL,
				  BLOCK_SIZE, SPLICE_F_MOVE | SPLICE_F_MORE);
		if (nwritten < 0)
		{
			ret = false;
			goto out;
		}
	} while (nwritten > 0);
out:
	close(pipefd[0]);
	close(pipefd[1]);
	return ret;
}
void copy(int srcfd)
{
	char buf[PAGE_SIZE * 16];
	size_t nread, nwritten, offset;
	while ((nread = read(srcfd, buf, sizeof buf)) >= 1)
	{
		for (offset = 0; nread > 0;
		     nread -= nwritten, offset += nwritten)
		{
			if ((nwritten = write(STDOUT_FILENO,
					      buf + offset, nread)) <= 0)
				err(EXIT_FAILURE, "write stdout");
		}
	}
}
void dumpfile(const char *path)
{
	int srcfd = STDIN_FILENO;
	char buf[PAGE_SIZE * 16];
	/* POSIX allows - to represent stdin. */
	if (*path != '-')
	{
		srcfd = open(path, O_RDONLY);
		if (srcfd < 0)
			err(EXIT_FAILURE, "open %s", path);
		(void) posix_fadvise(srcfd, 0, 0, POSIX_FADV_SEQUENTIAL);
	}
	/* Fall back to traditional copy if the spliced version fails. */
	if (!spliced_copy(srcfd))
		copy(srcfd);
	if (srcfd != STDIN_FILENO)
		(void) close(srcfd);
}
int main(int argc, const char *argv[])
{
	int i;
	int stdout_flags;
	stdout_flags = fcntl(STDOUT_FILENO, F_GETFL);
	if (stdout_flags < 0)
		err(EXIT_FAILURE, "fcntl(STDOUT_FILENO, F_GETFL)");
	stdout_flags &= ~O_APPEND;
	if (fcntl(STDOUT_FILENO, F_SETFL, stdout_flags) < 0)
		err(EXIT_FAILURE, "fcntl(STDOUT_FILENO, F_SETFL)");
	for (i = 1; i < argc; i++)
		dumpfile(argv[i]);
	return EXIT_SUCCESS;
}
Honorable mention: copy_file_range
While copy_file_range is not really that relevant to a cat
implementation, if both the source and output files are normal
files, you can use it to get even faster performance than using
splice, as the kernel handles all of the details on its own.  An
optimized cat might try this strategy and then downgrade to
splice, sendfile, and the normal read and write loop.
Performance comparison
To measure the performance of each strategy, we can simply use
dd as a sink, running each cat program piped into
dd of=/dev/null bs=64K iflag=fullblock.
The runs in the table below are averaged across 1000 runs on a 8GB RAM Linode, using a 4GB file in tmpfs.
| Strategy | Throughput | 
|---|---|
| cat-simple(readandwriteloop) | 3.6 GB/s | 
| cat-sendfile | 6.4 GB/s | 
| cat-splice | 11.6 GB/s | 
If you are interested in using these implementations in your own
cat implementation, you may do so under any license terms you
wish.