How efficient can cat(1) be?
There have been a few initiatives in recent years to implement a new userspace base system for Linux distributions as an alternative to the GNU coreutils and BusyBox. Recently, one of the authors of one of these proposed implementations made the pitch in a few IRC channels that her cat implementation, derived from OpenBSD's, was the most efficient. But is it, actually?
Understanding what cat actually does
At the most basic level, cat takes one or more files and dumps them to stdout. But do we need to actually use stdio for this? Actually, we don't, and most competent cat implementations at least use read(2) and write(2), if not more advanced approaches.
If we consider cat as a form of buffer copy between an arbitrary file descriptor and STDOUT_FILENO, we can understand what the most efficient strategy for cat would be: splicing. Anything which isn't doing splicing, after all, involves unnecessary buffer copies, and thus cannot be the most efficient.
To get the best performance out of spliced I/O, a few prerequisites must be met:

- The source and destination file descriptors should be unbuffered.
- Any intermediate buffer should be a multiple of the filesystem block size. In general, to avoid doing a stat syscall, we can assume that a multiple of PAGE_SIZE is likely acceptable (see the sketch after this list).
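One caveat worth noting: PAGE_SIZE is not guaranteed to be defined as a macro on every system, so a portable implementation might derive the buffer size at runtime instead. The following is a minimal sketch of that idea using sysconf(3); the helper name copy_buffer_size is our own invention:

#include <unistd.h>

/* Hypothetical helper: derive the copy buffer size from the page size
   at runtime instead of relying on a PAGE_SIZE macro. */
static long copy_buffer_size(void)
{
    long page_size = sysconf(_SC_PAGESIZE);

    if (page_size < 0)
        page_size = 4096;   /* assumed fallback */

    return page_size * 16;  /* 64KB on most systems */
}

The implementations below simply assume a PAGE_SIZE macro is available, with a fallback definition, which keeps the buffers usable as plain stack arrays.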
A simple cat implementation
The simplest way to implement cat is the way that it is done in BSD: using read and write on an intermediate buffer. This results in two buffer copies, but has the best portability. Most implementations of cat work this way, as it generally offers good enough performance.
/* This program is released into the public domain. */
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096  /* fallback if <limits.h> does not define it */
#endif

void dumpfile(const char *path)
{
    int srcfd = STDIN_FILENO;
    char buf[PAGE_SIZE * 16];
    ssize_t nread, nwritten;
    size_t offset;

    /* POSIX allows - to represent stdin. */
    if (*path != '-')
    {
        srcfd = open(path, O_RDONLY);
        if (srcfd < 0)
            err(EXIT_FAILURE, "open %s", path);
    }

    while ((nread = read(srcfd, buf, sizeof buf)) >= 1)
    {
        /* write(2) may be short; loop until the whole block is flushed. */
        for (offset = 0; nread > 0; nread -= nwritten, offset += nwritten)
        {
            if ((nwritten = write(STDOUT_FILENO, buf + offset, nread)) <= 0)
                err(EXIT_FAILURE, "write stdout");
        }
    }
    if (nread < 0)
        err(EXIT_FAILURE, "read %s", path);

    if (srcfd != STDIN_FILENO)
        (void) close(srcfd);
}

int main(int argc, const char *argv[])
{
    int i;

    for (i = 1; i < argc; i++)
        dumpfile(argv[i]);

    return EXIT_SUCCESS;
}
Implementing spliced I/O
Linux has no shortage of ways to perform spliced I/O. For our cat implementation, we have two possible ways to do it.

The first possible option is the venerable sendfile syscall, which was originally added to improve the file-serving performance of web servers. Originally, sendfile required the destination file descriptor to be a socket, but this restriction was removed in Linux 2.6.33. Unfortunately, sendfile is not perfect: because it only supports source file descriptors which can be memory mapped, we must fall back to a different strategy when copying from stdin.
/* This program is released into the public domain. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096  /* fallback if <limits.h> does not define it */
#endif

bool spliced_copy(int srcfd)
{
    ssize_t nwritten;
    off_t offset = 0;

    do
    {
        nwritten = sendfile(STDOUT_FILENO, srcfd, &offset,
            PAGE_SIZE * 16);
        if (nwritten < 0)
            return false;
    } while (nwritten > 0);

    return true;
}

void copy(int srcfd)
{
    char buf[PAGE_SIZE * 16];
    ssize_t nread, nwritten;
    size_t offset;

    while ((nread = read(srcfd, buf, sizeof buf)) >= 1)
    {
        /* write(2) may be short; loop until the whole block is flushed. */
        for (offset = 0; nread > 0;
             nread -= nwritten, offset += nwritten)
        {
            if ((nwritten = write(STDOUT_FILENO,
                buf + offset, nread)) <= 0)
                err(EXIT_FAILURE, "write stdout");
        }
    }
    if (nread < 0)
        err(EXIT_FAILURE, "read");
}

void dumpfile(const char *path)
{
    int srcfd = STDIN_FILENO;

    /* POSIX allows - to represent stdin. */
    if (*path != '-')
    {
        srcfd = open(path, O_RDONLY);
        if (srcfd < 0)
            err(EXIT_FAILURE, "open %s", path);
    }

    /* Fall back to the traditional copy if the spliced version fails. */
    if (!spliced_copy(srcfd))
        copy(srcfd);

    if (srcfd != STDIN_FILENO)
        (void) close(srcfd);
}

int main(int argc, const char *argv[])
{
    int i;
    int stdout_flags;

    /* sendfile(2) refuses to write to a descriptor with O_APPEND set,
       so clear the flag on stdout first. */
    stdout_flags = fcntl(STDOUT_FILENO, F_GETFL);
    if (stdout_flags < 0)
        err(EXIT_FAILURE, "fcntl(STDOUT_FILENO, F_GETFL)");

    stdout_flags &= ~O_APPEND;
    if (fcntl(STDOUT_FILENO, F_SETFL, stdout_flags) < 0)
        err(EXIT_FAILURE, "fcntl(STDOUT_FILENO, F_SETFL)");

    for (i = 1; i < argc; i++)
        dumpfile(argv[i]);

    return EXIT_SUCCESS;
}
Another approach is to use splice and a pipe. This allows for true zero-copy I/O in userspace, as a pipe is simply implemented as a 64KB ring buffer in the kernel. In this case, we use two splice operations per block of data we want to copy: one to move the data into the pipe, and another to move the data from the pipe to the output file.
/* This program is released into the public domain. */
#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096  /* fallback if <limits.h> does not define it */
#endif

#define BLOCK_SIZE ((PAGE_SIZE * 16) - 1)

bool spliced_copy(int srcfd)
{
    int pipefd[2];
    ssize_t nread, nwritten;
    off_t in_offset = 0;
    bool ret = true;

    if (pipe(pipefd) < 0)
        err(EXIT_FAILURE, "pipe");

    for (;;)
    {
        /* Move a block from the source into the pipe. */
        nread = splice(srcfd, &in_offset, pipefd[1], NULL,
            BLOCK_SIZE, SPLICE_F_MOVE | SPLICE_F_MORE);
        if (nread <= 0)
        {
            /* 0 means EOF; anything negative is an error. */
            ret = nread == 0;
            goto out;
        }

        /* Drain the pipe into stdout, allowing for short splices,
           so no data is left behind in the pipe at EOF. */
        while (nread > 0)
        {
            nwritten = splice(pipefd[0], NULL, STDOUT_FILENO, NULL,
                nread, SPLICE_F_MOVE | SPLICE_F_MORE);
            if (nwritten <= 0)
            {
                ret = false;
                goto out;
            }
            nread -= nwritten;
        }
    }

out:
    close(pipefd[0]);
    close(pipefd[1]);
    return ret;
}

void copy(int srcfd)
{
    char buf[PAGE_SIZE * 16];
    ssize_t nread, nwritten;
    size_t offset;

    while ((nread = read(srcfd, buf, sizeof buf)) >= 1)
    {
        /* write(2) may be short; loop until the whole block is flushed. */
        for (offset = 0; nread > 0;
             nread -= nwritten, offset += nwritten)
        {
            if ((nwritten = write(STDOUT_FILENO,
                buf + offset, nread)) <= 0)
                err(EXIT_FAILURE, "write stdout");
        }
    }
    if (nread < 0)
        err(EXIT_FAILURE, "read");
}

void dumpfile(const char *path)
{
    int srcfd = STDIN_FILENO;

    /* POSIX allows - to represent stdin. */
    if (*path != '-')
    {
        srcfd = open(path, O_RDONLY);
        if (srcfd < 0)
            err(EXIT_FAILURE, "open %s", path);

        /* Hint to the kernel that we will read sequentially. */
        (void) posix_fadvise(srcfd, 0, 0, POSIX_FADV_SEQUENTIAL);
    }

    /* Fall back to the traditional copy if the spliced version fails. */
    if (!spliced_copy(srcfd))
        copy(srcfd);

    if (srcfd != STDIN_FILENO)
        (void) close(srcfd);
}

int main(int argc, const char *argv[])
{
    int i;
    int stdout_flags;

    /* splice(2) refuses to write to a descriptor with O_APPEND set,
       so clear the flag on stdout first. */
    stdout_flags = fcntl(STDOUT_FILENO, F_GETFL);
    if (stdout_flags < 0)
        err(EXIT_FAILURE, "fcntl(STDOUT_FILENO, F_GETFL)");

    stdout_flags &= ~O_APPEND;
    if (fcntl(STDOUT_FILENO, F_SETFL, stdout_flags) < 0)
        err(EXIT_FAILURE, "fcntl(STDOUT_FILENO, F_SETFL)");

    for (i = 1; i < argc; i++)
        dumpfile(argv[i]);

    return EXIT_SUCCESS;
}
Honorable mention: copy_file_range
While copy_file_range is not usually relevant to cat, since stdout is rarely a regular file, it can deliver even better performance than splice when both the source and the destination are regular files, as the kernel handles all of the details on its own. An optimized cat might try this strategy first and then fall back to splice, sendfile, and finally the normal read and write loop.
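Here is a minimal sketch of what that first tier might look like, assuming both descriptors refer to regular files on Linux 4.5 or later; the function name range_copy and the 64KB chunk size are our own choices:

#define _GNU_SOURCE
#include <stdbool.h>
#include <unistd.h>

/* Hypothetical first tier: returns false on any error so the caller
   can fall back to splice, sendfile, or the read/write loop. */
bool range_copy(int srcfd, int dstfd)
{
    ssize_t nwritten;

    do
    {
        /* NULL offsets let the kernel advance both file offsets. */
        nwritten = copy_file_range(srcfd, NULL, dstfd, NULL,
            65536, 0);
        if (nwritten < 0)
            return false;
    } while (nwritten > 0);

    return true;
}

On filesystems that support it, copy_file_range can turn the copy into a metadata operation such as a reflink or a server-side copy, which is why it can beat even splice when the conditions line up.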
Performance comparison
To measure the performance of each strategy, we can simply use dd as a sink, running each cat program piped into dd of=/dev/null bs=64K iflag=fullblock. The figures in the table below are averaged across 1000 runs on an 8GB RAM Linode, using a 4GB file in tmpfs.
| Strategy | Throughput |
|---|---|
| cat-simple (read and write loop) | 3.6 GB/s |
| cat-sendfile | 6.4 GB/s |
| cat-splice | 11.6 GB/s |
If you are interested in using these implementations in your own cat, you may do so under any license terms you wish.