#include <sched.h>
#include <limits.h>
#include <stddef.h>
+#include <signal.h>
#include "linux/lguest_launcher.h"
#include "linux/virtio_config.h"
#include "linux/virtio_net.h"
#endif
/* We can have up to 256 pages for devices. */
#define DEVICE_PAGES 256
-/* This will occupy 2 pages: it must be a power of 2. */
-#define VIRTQUEUE_NUM 128
+/* This will occupy 3 pages: it must be a power of 2. */
+#define VIRTQUEUE_NUM 256
/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
* this, and although I wouldn't recommend it, it works quite nicely here. */
do { if (verbose) printf(args); } while(0)
/*:*/
-/* The pipe to send commands to the waker process */
-static int waker_fd;
+/* File descriptors for the Waker. */
+struct {
+ int pipe[2]; /* [0]: read by the Waker; [1]: written (and closed on exit) by the Launcher */
+ int lguest_fd; /* fd the Waker sends LHREQ_BREAK to */
+} waker_fds;
+
/* The pointer to the start of guest memory. */
static void *guest_base;
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max;
+/* The pipe for the signal handler to write to. */
+static int timeoutpipe[2];
+static unsigned int timeout_usec = 500;
/* a per-cpu variable indicating whose vcpu is currently running */
static unsigned int __thread cpu_id;
/* Last available index we saw. */
u16 last_avail_idx;
- /* The routine to call when the Guest pings us. */
- void (*handle_output)(int fd, struct virtqueue *me);
+ /* The routine to call when the Guest pings us, or timeout. */
+ void (*handle_output)(int fd, struct virtqueue *me, bool timeout);
/* Outstanding buffers */
unsigned int inflight;
+
+ /* Is this blocked awaiting a timer? */
+ bool blocked;
};
/* Remember the arguments to the program so we can "reboot" */
* watch, but handing a file descriptor mask through to the kernel is fairly
* icky.
*
- * Instead, we fork off a process which watches the file descriptors and writes
+ * Instead, we clone off a thread which watches the file descriptors and writes
* the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
* stop running the Guest. This causes the Launcher to return from the
* /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
* the LHREQ_BREAK and wake us up again.
*
* This, of course, is merely a different *kind* of icky.
+ *
+ * Given my well-known antipathy to threads, I'd prefer to use processes. But
+ * it's easier to share Guest memory with threads, and trivial to share the
+ * devices.infds as the Launcher changes it.
*/
-static void wake_parent(int pipefd, int lguest_fd)
+static int waker(void *unused)
{
- /* Add the pipe from the Launcher to the fdset in the device_list, so
- * we watch it, too. */
- add_device_fd(pipefd);
+ /* Close the write end of the pipe: only the Launcher has it open. */
+ close(waker_fds.pipe[1]);
for (;;) {
fd_set rfds = devices.infds;
unsigned long args[] = { LHREQ_BREAK, 1 };
+ unsigned int maxfd = devices.max_infd;
+
+ /* We also listen to the pipe from the Launcher. */
+ FD_SET(waker_fds.pipe[0], &rfds);
+ if (waker_fds.pipe[0] > maxfd)
+ maxfd = waker_fds.pipe[0];
/* Wait until input is ready from one of the devices. */
- select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
- /* Is it a message from the Launcher? */
- if (FD_ISSET(pipefd, &rfds)) {
- int fd;
- /* If read() returns 0, it means the Launcher has
- * exited. We silently follow. */
- if (read(pipefd, &fd, sizeof(fd)) == 0)
- exit(0);
- /* Otherwise it's telling us to change what file
- * descriptors we're to listen to. Positive means
- * listen to a new one, negative means stop
- * listening. */
- if (fd >= 0)
- FD_SET(fd, &devices.infds);
- else
- FD_CLR(-fd - 1, &devices.infds);
- } else /* Send LHREQ_BREAK command. */
- pwrite(lguest_fd, args, sizeof(args), cpu_id);
+ /* If select() fails (eg. interrupted by a signal), rfds is not
+ * meaningful: don't act on it, just go around again with a fresh
+ * copy of devices.infds. */
+ if (select(maxfd+1, &rfds, NULL, NULL, NULL) < 0)
+ continue;
+
+ /* Message from Launcher? */
+ if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
+ char c;
+ /* If this fails, then assume Launcher has exited.
+ * Don't do anything on exit: we're just a thread! */
+ if (read(waker_fds.pipe[0], &c, 1) != 1)
+ _exit(0);
+ /* The byte itself carries no data: the Launcher only
+ * wanted us to re-read devices.infds. */
+ continue;
+ }
+
+ /* Send LHREQ_BREAK command to snap the Launcher out of it. */
+ pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id);
}
+ return 0;
}
/* This routine just sets up a pipe to the Waker process. */
-static int setup_waker(int lguest_fd)
-{
- int pipefd[2], child;
-
- /* We create a pipe to talk to the Waker, and also so it knows when the
- * Launcher dies (and closes pipe). */
- pipe(pipefd);
- child = fork();
- if (child == -1)
- err(1, "forking");
-
- if (child == 0) {
- /* We are the Waker: close the "writing" end of our copy of the
- * pipe and start waiting for input. */
- close(pipefd[1]);
- wake_parent(pipefd[0], lguest_fd);
- }
- /* Close the reading end of our copy of the pipe. */
- close(pipefd[0]);
+static void setup_waker(int lguest_fd)
+{
+ char *stack;
+
+ /* This pipe is closed when Launcher dies, telling Waker. */
+ if (pipe(waker_fds.pipe) != 0)
+ err(1, "Creating pipe for Waker");
+
+ /* Waker also needs to know the lguest fd */
+ waker_fds.lguest_fd = lguest_fd;
- /* Here is the fd used to talk to the waker. */
- return pipefd[1];
+ /* The Waker runs on its own 4096-byte stack. Check the allocation
+ * before using it: clone() is handed the *top* of the stack, so a
+ * failed malloc() would otherwise give it a bogus address. */
+ stack = malloc(4096);
+ if (!stack)
+ err(1, "Allocating stack for Waker");
+
+ if (clone(waker, stack + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
+ err(1, "Creating Waker");
}
/*
unsigned long args[] = { LHREQ_BREAK, 0 };
/* Close the fd so Waker will know it has to
* exit. */
- close(waker_fd);
- /* Just in case waker is blocked in BREAK, send
+ close(waker_fds.pipe[1]);
+ /* Just in case Waker is blocked in BREAK, send
* unbreak now. */
write(fd, args, sizeof(args));
exit(2);
/* Handling output for console is simple: we just get all the output buffers
* and write them to stdout. */
-static void handle_console_output(int fd, struct virtqueue *vq)
+static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
{
unsigned int head, out, in;
int len;
}
}
+/* This is called when we no longer want to hear about Guest changes to a
+ * virtqueue. This is more efficient in high-traffic cases, but it means we
+ * have to set a timer to check if any more changes have occurred.
+ *
+ * The queue is marked blocked and a one-shot ITIMER_REAL timer of
+ * timeout_usec microseconds is armed. */
+static void block_vq(struct virtqueue *vq)
+{
+	struct itimerval itm;
+
+	vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+	vq->blocked = true;
+
+	/* One shot only: no reload interval. */
+	itm.it_interval.tv_sec = 0;
+	itm.it_interval.tv_usec = 0;
+	/* timeout_usec has no upper bound, and setitimer() rejects a tv_usec
+	 * of a million or more, so split it into seconds and microseconds. */
+	itm.it_value.tv_sec = timeout_usec / 1000000;
+	itm.it_value.tv_usec = timeout_usec % 1000000;
+
+	/* Without the timer nothing would ever unblock this queue again, so
+	 * failing to arm it is fatal. */
+	if (setitimer(ITIMER_REAL, &itm, NULL) != 0)
+		err(1, "Setting timeout timer");
+}
+
/*
* The Network
*
* and write them (ignoring the first element) to this device's file descriptor
* (/dev/net/tun).
*/
-static void handle_net_output(int fd, struct virtqueue *vq)
+static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
{
- unsigned int head, out, in;
+ unsigned int head, out, in, num = 0;
int len;
struct iovec iov[vq->vring.num];
+ /* How many packets the previous timeout-driven call sent. */
+ static int last_timeout_num;
/* Keep getting output buffers from the Guest until we run out. */
while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
if (in)
errx(1, "Input buffers in output queue?");
- /* Check header, but otherwise ignore it (we told the Guest we
- * supported no features, so it shouldn't have anything
- * interesting). */
- (void)convert(&iov[0], struct virtio_net_hdr);
- len = writev(vq->dev->fd, iov+1, out-1);
+ len = writev(vq->dev->fd, iov, out);
+ if (len < 0)
+ err(1, "Writing network packet to tun");
add_used_and_trigger(fd, vq, head, len);
+ num++;
+ }
+
+ /* Guest-initiated call that sent something: block further kicks and
+ * set up a timer to poll the queue instead. */
+ if (!timeout && num)
+ block_vq(vq);
+
+ /* We never quite know how long we should wait before we check the
+ * queue again for more packets. We start at 500 microseconds, and if
+ * we get fewer packets than last time, we assume we made the timeout
+ * too small and increase it by 10 microseconds. Otherwise, we drop it
+ * by one microsecond every time. It seems to work well enough. */
+ if (timeout) {
+ if (num < last_timeout_num)
+ timeout_usec += 10;
+ else if (timeout_usec > 1)
+ timeout_usec--;
+ last_timeout_num = num;
}
}
unsigned int head, in_num, out_num;
int len;
struct iovec iov[dev->vq->vring.num];
- struct virtio_net_hdr *hdr;
/* First we need a network buffer from the Guests's recv virtqueue. */
head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
* early, the Guest won't be ready yet. Wait until the device
* status says it's ready. */
/* FIXME: Actually want DRIVER_ACTIVE here. */
- if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
- warn("network: no dma buffer!");
+
+ /* Now tell it we want to know if new things appear. */
+ dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ wmb();
+
/* We'll turn this back on if input buffers are registered. */
return false;
} else if (out_num)
errx(1, "Output buffers in network recv queue?");
- /* First element is the header: we set it to 0 (no features). */
- hdr = convert(&iov[0], struct virtio_net_hdr);
- hdr->flags = 0;
- hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
-
/* Read the packet from the device directly into the Guest's buffer. */
- len = readv(dev->fd, iov+1, in_num-1);
+ len = readv(dev->fd, iov, in_num);
if (len <= 0)
err(1, "reading network");
/* Tell the Guest about the new packet. */
- add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
+ add_used_and_trigger(fd, dev->vq, head, len);
verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
/*L:215 This is the callback attached to the network and console input
* virtqueues: it ensures we try again, in case we stopped console or net
* delivery because Guest didn't have any buffers. */
-static void enable_fd(int fd, struct virtqueue *vq)
+static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
{
add_device_fd(vq->dev->fd);
- /* Tell waker to listen to it again */
- write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
+ /* Snap the Waker out of its select loop so that it picks up the
+ * changed devices.infds. The byte written carries no information. */
+ write(waker_fds.pipe[1], "", 1);
+}
+
+/* Network flavour of enable_fd(): additionally sets VRING_USED_F_NO_NOTIFY
+ * on the queue before re-enabling the device's fd. */
+static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout)
+{
+ /* We don't need to know again when Guest refills receive buffer. */
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ enable_fd(fd, vq, timeout);
}
/* When the Guest tells us they updated the status field, we handle it. */
if (strcmp(vq->dev->name, "console") != 0)
verbose("Output to %s\n", vq->dev->name);
if (vq->handle_output)
- vq->handle_output(fd, vq);
+ vq->handle_output(fd, vq, false);
return;
}
}
strnlen(from_guest_phys(addr), guest_limit - addr));
}
+/* The timeout pipe is readable: drain it, then give every blocked virtqueue
+ * a timeout-driven call to its output handler. */
+static void handle_timeout(int fd)
+{
+	struct device *dev;
+	char drain[32];
+
+	/* Swallow the pending wake-up bytes; their contents don't matter. */
+	read(timeoutpipe[0], drain, sizeof(drain));
+
+	/* Visit every virtqueue of every device, acting only on blocked ones. */
+	for (dev = devices.dev; dev != NULL; dev = dev->next) {
+		struct virtqueue *vq = dev->vq;
+
+		while (vq != NULL) {
+			if (vq->blocked) {
+				/* Let the Guest notify us again, then flush. */
+				vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+				vq->blocked = false;
+				if (vq->handle_output != NULL)
+					vq->handle_output(fd, vq, true);
+			}
+			vq = vq->next;
+		}
+	}
+}
+
/* This is called when the Waker wakes us up: check for incoming file
* descriptors. */
static void handle_input(int fd)
for (;;) {
struct device *i;
fd_set fds = devices.infds;
+ int num;
+ num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
+ /* Could get interrupted */
+ if (num < 0)
+ continue;
/* If nothing is ready, we're done. */
- if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
+ if (num == 0)
break;
/* Otherwise, call the device(s) which have readable file
* descriptors and a method of handling them. */
for (i = devices.dev; i; i = i->next) {
if (i->handle_input && FD_ISSET(i->fd, &fds)) {
- int dev_fd;
if (i->handle_input(fd, i))
continue;
* buffers to deliver into. Console also uses
* it when it discovers that stdin is closed. */
FD_CLR(i->fd, &devices.infds);
- /* Tell waker to ignore it too, by sending a
- * negative fd number (-1, since 0 is a valid
- * FD number). */
- dev_fd = -i->fd - 1;
- write(waker_fd, &dev_fd, sizeof(dev_fd));
}
}
+
+ /* Is this the timeout fd? */
+ if (FD_ISSET(timeoutpipe[0], &fds))
+ handle_timeout(fd);
}
}
/* Each device descriptor is followed by the description of its virtqueues. We
* specify how many descriptors the virtqueue is to have. */
static void add_virtqueue(struct device *dev, unsigned int num_descs,
- void (*handle_output)(int fd, struct virtqueue *me))
+ void (*handle_output)(int, struct virtqueue *, bool))
{
unsigned int pages;
struct virtqueue **i, *vq = malloc(sizeof(*vq));
vq->last_avail_idx = 0;
vq->dev = dev;
vq->inflight = 0;
+ vq->blocked = false;
/* Initialize the configuration. */
vq->config.num = num_descs;
}
/*:*/
+/* SIGALRM handler: poke the timeout pipe so the main loop notices that the
+ * timer has expired. Only async-signal-safe work is done here. */
+static void timeout_alarm(int sig)
+{
+	/* write() can fail and overwrite errno underneath whatever code we
+	 * interrupted (which may be about to test errno itself), so preserve
+	 * it across the handler. */
+	int saved_errno = errno;
+
+	(void)sig;
+	/* The write end is non-blocking, so this may fail when the pipe is
+	 * full; that just means a wake-up is already pending. */
+	write(timeoutpipe[1], "", 1);
+	errno = saved_errno;
+}
+
+/* Create the timeout pipe, make its write end non-blocking, add its read end
+ * to the watched device fds and install the SIGALRM handler that feeds it.
+ * Any failure is fatal. */
+static void setup_timeout(void)
+{
+	int flags;
+
+	if (pipe(timeoutpipe) != 0)
+		err(1, "Creating timeout pipe");
+
+	/* The signal handler writes to this end, so it must never block.
+	 * Check F_GETFL on its own: its -1 error return must not be OR-ed
+	 * into the flags we then set. */
+	flags = fcntl(timeoutpipe[1], F_GETFL);
+	if (flags == -1
+	    || fcntl(timeoutpipe[1], F_SETFL, flags | O_NONBLOCK) != 0)
+		err(1, "Making timeout pipe nonblocking");
+
+	add_device_fd(timeoutpipe[0]);
+	if (signal(SIGALRM, timeout_alarm) == SIG_ERR)
+		err(1, "Installing SIGALRM handler");
+}
+
/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
* --sharenet=<name> option which opens or creates a named pipe. This can be
* used to send packets to another guest in a 1:1 manner.
err(1, "Bringing interface %s up", tapif);
}
-static void get_mac(int fd, const char *tapif, unsigned char hwaddr[6])
-{
- struct ifreq ifr;
-
- memset(&ifr, 0, sizeof(ifr));
- strcpy(ifr.ifr_name, tapif);
-
- /* SIOC stands for Socket I/O Control. G means Get (vs S for Set
- * above). IF means Interface, and HWADDR is hardware address.
- * Simple! */
- if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
- err(1, "getting hw address for %s", tapif);
- memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
-}
-
static int get_tun_device(char tapif[IFNAMSIZ])
{
struct ifreq ifr;
* the truth, I completely blundered my way through this code, but it
* works now! */
netfd = open_or_die("/dev/net/tun", O_RDWR);
- ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
strcpy(ifr.ifr_name, "tap%d");
if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
err(1, "configuring /dev/net/tun");
+ if (ioctl(netfd, TUNSETOFFLOAD,
+ TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
+ err(1, "Could not set features for tun device");
+
/* We don't need checksums calculated for packets coming in this
* device: trust us! */
ioctl(netfd, TUNSETNOCSUM, 1);
/* Network devices need a receive and a send queue, just like
* console. */
- add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
/* We need a socket to perform the magic network ioctls to bring up the
p = strchr(arg, ':');
if (p) {
str2mac(p+1, conf.mac);
+ add_feature(dev, VIRTIO_NET_F_MAC);
*p = '\0';
- } else {
- p = arg + strlen(arg);
- /* None supplied; query the randomly assigned mac. */
- get_mac(ipfd, tapif, conf.mac);
}
/* arg is now either an IP address or a bridge name */
/* Set up the tun device. */
configure_device(ipfd, tapif, ip);
- /* Tell Guest what MAC address to use. */
- add_feature(dev, VIRTIO_NET_F_MAC);
add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY);
+ /* Expect Guest to handle everything except UFO */
+ add_feature(dev, VIRTIO_NET_F_CSUM);
+ add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
+ add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
+ add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
+ add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
+ add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
+ add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
+ add_feature(dev, VIRTIO_NET_F_HOST_ECN);
set_config(dev, sizeof(conf), &conf);
/* We don't need the socket any more; setup is done. */
}
/* When the Guest submits some I/O, we just need to wake the I/O thread. */
-static void handle_virtblk_output(int fd, struct virtqueue *vq)
+static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
{
struct vblk_info *vblk = vq->dev->priv;
char c = 0;
{
unsigned int i;
- /* Closing pipes causes the Waker thread and io_threads to die, and
- * closing /dev/lguest cleans up the Guest. Since we don't track all
- * open fds, we simply close everything beyond stderr. */
+ /* Since we don't track all open fds, we simply close everything beyond
+ * stderr. */
for (i = 3; i < FD_SETSIZE; i++)
close(i);
+
+ /* The exec automatically gets rid of the I/O and Waker threads. */
execv(main_args[0], main_args);
err(1, "Could not exec %s", main_args[0]);
}
/* ERESTART means that we need to reboot the guest */
} else if (errno == ERESTART) {
restart_guest();
- /* EAGAIN means the Waker wanted us to look at some input.
+ /* EAGAIN means a signal (timeout).
* Anything else means a bug or incompatible change. */
} else if (errno != EAGAIN)
err(1, "Running guest failed");
/* We always have a console device */
setup_console();
+ /* We can timeout waiting for Guest network transmit. */
+ setup_timeout();
+
/* Now we load the kernel */
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
* /dev/lguest file descriptor. */
lguest_fd = tell_kernel(pgdir, start);
- /* We fork off a child process, which wakes the Launcher whenever one
- * of the input file descriptors needs attention. We call this the
- * Waker, and we'll cover it in a moment. */
- waker_fd = setup_waker(lguest_fd);
+ /* We clone off a thread, which wakes the Launcher whenever one of the
+ * input file descriptors needs attention. We call this the Waker, and
+ * we'll cover it in a moment. */
+ setup_waker(lguest_fd);
/* Finally, run the Guest. This doesn't return. */
run_guest(lguest_fd);