summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS13
-rw-r--r--Makefile2
-rw-r--r--arch/arm64/configs/cuttlefish_defconfig4
-rw-r--r--arch/arm64/configs/msm-auto-perf_defconfig1
-rw-r--r--arch/arm64/configs/msm-auto_defconfig1
-rw-r--r--arch/x86/configs/x86_64_cuttlefish_defconfig4
-rw-r--r--crypto/cts.c8
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/acpi/power.c22
-rw-r--r--drivers/gpu/drm/virtio/virtgpu_kms.c2
-rw-r--r--drivers/i2c/i2c-dev.c6
-rw-r--r--drivers/md/Kconfig16
-rw-r--r--drivers/md/dm-verity-target.c9
-rw-r--r--drivers/misc/mic/card/mic_virtio.c2
-rw-r--r--drivers/net/caif/Kconfig2
-rw-r--r--drivers/pci/host/pcie-altera.c201
-rw-r--r--drivers/remoteproc/remoteproc_virtio.c2
-rw-r--r--drivers/rpmsg/virtio_rpmsg_bus.c2
-rw-r--r--drivers/s390/virtio/kvm_virtio.c2
-rw-r--r--drivers/s390/virtio/virtio_ccw.c2
-rw-r--r--drivers/usb/class/cdc-acm.c7
-rw-r--r--drivers/usb/core/quirks.c3
-rw-r--r--drivers/usb/storage/scsiglue.c8
-rw-r--r--drivers/usb/storage/unusual_devs.h12
-rw-r--r--drivers/vhost/Kconfig18
-rw-r--r--drivers/vhost/Kconfig.vringh5
-rw-r--r--drivers/vhost/Makefile4
-rw-r--r--drivers/vhost/net.c214
-rw-r--r--drivers/vhost/scsi.c2
-rw-r--r--drivers/vhost/test.c2
-rw-r--r--drivers/vhost/vhost.c1021
-rw-r--r--drivers/vhost/vhost.h69
-rw-r--r--drivers/vhost/vsock.c797
-rw-r--r--drivers/virtio/Kconfig2
-rw-r--r--drivers/virtio/virtio_balloon.c2
-rw-r--r--drivers/virtio/virtio_input.c2
-rw-r--r--drivers/virtio/virtio_mmio.c2
-rw-r--r--drivers/virtio/virtio_pci_common.c4
-rw-r--r--drivers/virtio/virtio_pci_common.h2
-rw-r--r--drivers/virtio/virtio_pci_modern.c2
-rw-r--r--drivers/virtio/virtio_ring.c249
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/ctree.c14
-rw-r--r--fs/btrfs/ctree.h141
-rw-r--r--fs/btrfs/dev-replace.c2
-rw-r--r--fs/btrfs/disk-io.c80
-rw-r--r--fs/btrfs/extent-tree.c112
-rw-r--r--fs/btrfs/extent_io.c43
-rw-r--r--fs/btrfs/extent_io.h19
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h10
-rw-r--r--fs/btrfs/scrub.c2
-rw-r--r--fs/btrfs/struct-funcs.c9
-rw-r--r--fs/btrfs/tree-checker.c649
-rw-r--r--fs/btrfs/tree-checker.h38
-rw-r--r--fs/btrfs/volumes.c139
-rw-r--r--fs/btrfs/volumes.h2
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/cifs/smb2file.c4
-rw-r--r--fs/cifs/transport.c2
-rw-r--r--fs/ext4/inline.c6
-rw-r--r--fs/f2fs/data.c45
-rw-r--r--include/linux/hashtable.h4
-rw-r--r--include/linux/sunrpc/svc.h5
-rw-r--r--include/linux/virtio_config.h15
-rw-r--r--include/linux/virtio_vsock.h157
-rw-r--r--include/net/af_vsock.h9
-rw-r--r--include/trace/events/vsock_virtio_transport_common.h144
-rw-r--r--include/uapi/linux/Kbuild1
-rw-r--r--include/uapi/linux/vhost.h39
-rw-r--r--include/uapi/linux/virtio_config.h10
-rw-r--r--include/uapi/linux/virtio_ids.h1
-rw-r--r--include/uapi/linux/virtio_vsock.h94
-rw-r--r--mm/slab.c6
-rw-r--r--net/core/dev.c2
-rw-r--r--net/sunrpc/svc.c10
-rw-r--r--net/sunrpc/svc_xprt.c5
-rw-r--r--net/sunrpc/svcsock.c2
-rw-r--r--net/vmw_vsock/Kconfig20
-rw-r--r--net/vmw_vsock/Makefile6
-rw-r--r--net/vmw_vsock/af_vsock.c57
-rw-r--r--net/vmw_vsock/virtio_transport.c620
-rw-r--r--net/vmw_vsock/virtio_transport_common.c999
-rw-r--r--net/vmw_vsock/vmci_transport.c11
-rw-r--r--sound/pci/hda/patch_realtek.c16
-rw-r--r--tools/virtio/linux/dma-mapping.h17
86 files changed, 5686 insertions, 622 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 13cc0bc83c1c..33b502bf5909 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11469,6 +11469,19 @@ S: Maintained
F: drivers/media/v4l2-core/videobuf2-*
F: include/media/videobuf2-*
+VIRTIO AND VHOST VSOCK DRIVER
+M: Stefan Hajnoczi <stefanha@redhat.com>
+L: kvm@vger.kernel.org
+L: virtualization@lists.linux-foundation.org
+L: netdev@vger.kernel.org
+S: Maintained
+F: include/linux/virtio_vsock.h
+F: include/uapi/linux/virtio_vsock.h
+F: net/vmw_vsock/virtio_transport_common.c
+F: net/vmw_vsock/virtio_transport.c
+F: drivers/vhost/vsock.c
+F: drivers/vhost/vsock.h
+
VIRTUAL SERIO DEVICE DRIVER
M: Stephen Chandler Paul <thatslyude@gmail.com>
S: Maintained
diff --git a/Makefile b/Makefile
index a1fb0dc36a64..e205b0519078 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 4
-SUBLEVEL = 170
+SUBLEVEL = 171
EXTRAVERSION =
NAME = Blurry Fish Butt
diff --git a/arch/arm64/configs/cuttlefish_defconfig b/arch/arm64/configs/cuttlefish_defconfig
index 2dacb5ca76df..7457781a6f4f 100644
--- a/arch/arm64/configs/cuttlefish_defconfig
+++ b/arch/arm64/configs/cuttlefish_defconfig
@@ -179,6 +179,8 @@ CONFIG_NET_CLS_U32=y
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_U32=y
CONFIG_NET_CLS_ACT=y
+CONFIG_VSOCKETS=y
+CONFIG_VIRTIO_VSOCKETS=y
CONFIG_CFG80211=y
# CONFIG_CFG80211_DEFAULT_PS is not set
CONFIG_MAC80211=y
@@ -385,6 +387,8 @@ CONFIG_SDCARD_FS=y
CONFIG_PSTORE=y
CONFIG_PSTORE_CONSOLE=y
CONFIG_PSTORE_RAM=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_VHOST_VSOCK=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/arm64/configs/msm-auto-perf_defconfig b/arch/arm64/configs/msm-auto-perf_defconfig
index c5714c4674a7..6f282aa1044e 100644
--- a/arch/arm64/configs/msm-auto-perf_defconfig
+++ b/arch/arm64/configs/msm-auto-perf_defconfig
@@ -276,7 +276,6 @@ CONFIG_DM_CRYPT=y
CONFIG_DM_REQ_CRYPT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE=1
CONFIG_DM_ANDROID_VERITY=y
CONFIG_NETDEVICES=y
CONFIG_BONDING=y
diff --git a/arch/arm64/configs/msm-auto_defconfig b/arch/arm64/configs/msm-auto_defconfig
index d84126bf2cfa..07a636b76d6e 100644
--- a/arch/arm64/configs/msm-auto_defconfig
+++ b/arch/arm64/configs/msm-auto_defconfig
@@ -277,7 +277,6 @@ CONFIG_DM_CRYPT=y
CONFIG_DM_REQ_CRYPT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE=1
CONFIG_DM_ANDROID_VERITY=y
CONFIG_NETDEVICES=y
CONFIG_BONDING=y
diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig
index 975d083b2317..7e83b1f6b015 100644
--- a/arch/x86/configs/x86_64_cuttlefish_defconfig
+++ b/arch/x86/configs/x86_64_cuttlefish_defconfig
@@ -191,6 +191,8 @@ CONFIG_NET_CLS_U32=y
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_U32=y
CONFIG_NET_CLS_ACT=y
+CONFIG_VSOCKETS=y
+CONFIG_VIRTIO_VSOCKETS=y
CONFIG_CFG80211=y
CONFIG_MAC80211=y
CONFIG_RFKILL=y
@@ -223,7 +225,6 @@ CONFIG_DM_MIRROR=y
CONFIG_DM_ZERO=y
CONFIG_DM_UEVENT=y
CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE=1
CONFIG_DM_VERITY_FEC=y
CONFIG_DM_ANDROID_VERITY=y
CONFIG_NETDEVICES=y
@@ -460,3 +461,4 @@ CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
CONFIG_X509_CERTIFICATE_PARSER=y
CONFIG_SYSTEM_TRUSTED_KEYRING=y
CONFIG_SYSTEM_TRUSTED_KEYS="verity_dev_keys.x509"
+CONFIG_VHOST_VSOCK=y
diff --git a/crypto/cts.c b/crypto/cts.c
index e467ec0acf9f..e65688d6a4ca 100644
--- a/crypto/cts.c
+++ b/crypto/cts.c
@@ -137,8 +137,8 @@ static int crypto_cts_encrypt(struct blkcipher_desc *desc,
lcldesc.info = desc->info;
lcldesc.flags = desc->flags;
- if (tot_blocks == 1) {
- err = crypto_blkcipher_encrypt_iv(&lcldesc, dst, src, bsize);
+ if (tot_blocks <= 1) {
+ err = crypto_blkcipher_encrypt_iv(&lcldesc, dst, src, nbytes);
} else if (nbytes <= bsize * 2) {
err = cts_cbc_encrypt(ctx, desc, dst, src, 0, nbytes);
} else {
@@ -232,8 +232,8 @@ static int crypto_cts_decrypt(struct blkcipher_desc *desc,
lcldesc.info = desc->info;
lcldesc.flags = desc->flags;
- if (tot_blocks == 1) {
- err = crypto_blkcipher_decrypt_iv(&lcldesc, dst, src, bsize);
+ if (tot_blocks <= 1) {
+ err = crypto_blkcipher_decrypt_iv(&lcldesc, dst, src, nbytes);
} else if (nbytes <= bsize * 2) {
err = cts_cbc_decrypt(ctx, desc, dst, src, 0, nbytes);
} else {
diff --git a/drivers/Makefile b/drivers/Makefile
index d563f5c13544..d3f690ab5b27 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -141,6 +141,7 @@ obj-$(CONFIG_OF) += of/
obj-$(CONFIG_SSB) += ssb/
obj-$(CONFIG_BCMA) += bcma/
obj-$(CONFIG_VHOST_RING) += vhost/
+obj-$(CONFIG_VHOST) += vhost/
obj-$(CONFIG_VLYNQ) += vlynq/
obj-$(CONFIG_STAGING) += staging/
obj-y += platform/
diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c
index 1c2b846c5776..f28b4949cb9d 100644
--- a/drivers/acpi/power.c
+++ b/drivers/acpi/power.c
@@ -131,6 +131,23 @@ void acpi_power_resources_list_free(struct list_head *list)
}
}
+static bool acpi_power_resource_is_dup(union acpi_object *package,
+ unsigned int start, unsigned int i)
+{
+ acpi_handle rhandle, dup;
+ unsigned int j;
+
+ /* The caller is expected to check the package element types */
+ rhandle = package->package.elements[i].reference.handle;
+ for (j = start; j < i; j++) {
+ dup = package->package.elements[j].reference.handle;
+ if (dup == rhandle)
+ return true;
+ }
+
+ return false;
+}
+
int acpi_extract_power_resources(union acpi_object *package, unsigned int start,
struct list_head *list)
{
@@ -150,6 +167,11 @@ int acpi_extract_power_resources(union acpi_object *package, unsigned int start,
err = -ENODEV;
break;
}
+
+ /* Some ACPI tables contain duplicate power resource references */
+ if (acpi_power_resource_is_dup(package, start, i))
+ continue;
+
err = acpi_add_power_resource(rhandle);
if (err)
break;
diff --git a/drivers/gpu/drm/virtio/virtgpu_kms.c b/drivers/gpu/drm/virtio/virtgpu_kms.c
index 06496a128162..4150873d432e 100644
--- a/drivers/gpu/drm/virtio/virtgpu_kms.c
+++ b/drivers/gpu/drm/virtio/virtgpu_kms.c
@@ -130,7 +130,7 @@ int virtio_gpu_driver_load(struct drm_device *dev, unsigned long flags)
static vq_callback_t *callbacks[] = {
virtio_gpu_ctrl_ack, virtio_gpu_cursor_ack
};
- static const char *names[] = { "control", "cursor" };
+ static const char * const names[] = { "control", "cursor" };
struct virtio_gpu_device *vgdev;
/* this will expand later */
diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c
index 94c837046786..57e3790c87b1 100644
--- a/drivers/i2c/i2c-dev.c
+++ b/drivers/i2c/i2c-dev.c
@@ -459,9 +459,15 @@ static long i2cdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return i2cdev_ioctl_smbus(client, arg);
case I2C_RETRIES:
+ if (arg > INT_MAX)
+ return -EINVAL;
+
client->adapter->retries = arg;
break;
case I2C_TIMEOUT:
+ if (arg > INT_MAX)
+ return -EINVAL;
+
/* For historical reasons, user-space sets the timeout
* value in units of 10 ms.
*/
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 0ad06670fa99..7bf1cdb582f5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -476,21 +476,6 @@ config DM_VERITY
If unsure, say N.
-config DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
- bool "Prefetch size 128"
-
-config DM_VERITY_HASH_PREFETCH_MIN_SIZE
- int "Verity hash prefetch minimum size"
- depends on DM_VERITY
- range 1 4096
- default 128 if DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
- default 1
- ---help---
- This sets minimum number of hash blocks to prefetch for dm-verity.
- For devices like eMMC, having larger prefetch size like 128 can improve
- performance with increased memory consumption for keeping more hashes
- in RAM.
-
config DM_VERITY_FEC
bool "Verity forward error correction support"
depends on DM_VERITY
@@ -554,7 +539,6 @@ config DM_ANDROID_VERITY
depends on ASYMMETRIC_KEY_TYPE
depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE
depends on MD_LINEAR=y
- select DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
---help---
This device-mapper target is virtually a VERITY target. This
target is setup by reading the metadata contents piggybacked
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index d2e3abc182b3..131077aabd08 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -529,7 +529,6 @@ static void verity_prefetch_io(struct work_struct *work)
container_of(work, struct dm_verity_prefetch_work, work);
struct dm_verity *v = pw->v;
int i;
- sector_t prefetch_size;
for (i = v->levels - 2; i >= 0; i--) {
sector_t hash_block_start;
@@ -552,14 +551,8 @@ static void verity_prefetch_io(struct work_struct *work)
hash_block_end = v->hash_blocks - 1;
}
no_prefetch_cluster:
- // for emmc, it is more efficient to send bigger read
- prefetch_size = max((sector_t)CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE,
- hash_block_end - hash_block_start + 1);
- if ((hash_block_start + prefetch_size) >= (v->hash_start + v->hash_blocks)) {
- prefetch_size = hash_block_end - hash_block_start + 1;
- }
dm_bufio_prefetch(v->bufio, hash_block_start,
- prefetch_size);
+ hash_block_end - hash_block_start + 1);
}
kfree(pw);
diff --git a/drivers/misc/mic/card/mic_virtio.c b/drivers/misc/mic/card/mic_virtio.c
index e486a0c26267..f6ed57d3125c 100644
--- a/drivers/misc/mic/card/mic_virtio.c
+++ b/drivers/misc/mic/card/mic_virtio.c
@@ -311,7 +311,7 @@ unmap:
static int mic_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[])
+ const char * const names[])
{
struct mic_vdev *mvdev = to_micvdev(vdev);
struct mic_device_ctrl __iomem *dc = mvdev->dc;
diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index 547098086773..f81df91a9ce1 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -52,5 +52,5 @@ config CAIF_VIRTIO
The caif driver for CAIF over Virtio.
if CAIF_VIRTIO
-source "drivers/vhost/Kconfig"
+source "drivers/vhost/Kconfig.vringh"
endif
diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c
index 99da549d5d06..0118287a8a10 100644
--- a/drivers/pci/host/pcie-altera.c
+++ b/drivers/pci/host/pcie-altera.c
@@ -40,8 +40,10 @@
#define P2A_INT_ENABLE 0x3070
#define P2A_INT_ENA_ALL 0xf
#define RP_LTSSM 0x3c64
+#define RP_LTSSM_MASK 0x1f
#define LTSSM_L0 0xf
+#define PCIE_CAP_OFFSET 0x80
/* TLP configuration type 0 and 1 */
#define TLP_FMTTYPE_CFGRD0 0x04 /* Configuration Read Type 0 */
#define TLP_FMTTYPE_CFGWR0 0x44 /* Configuration Write Type 0 */
@@ -60,6 +62,9 @@
#define TLP_LOOP 500
#define RP_DEVFN 0
+#define LINK_UP_TIMEOUT HZ
+#define LINK_RETRAIN_TIMEOUT HZ
+
#define INTX_NUM 4
#define DWORD_MASK 3
@@ -80,25 +85,21 @@ struct tlp_rp_regpair_t {
u32 reg1;
};
-static void altera_pcie_retrain(struct pci_dev *dev)
+static inline void cra_writel(struct altera_pcie *pcie, const u32 value,
+ const u32 reg)
{
- u16 linkcap, linkstat;
-
- /*
- * Set the retrain bit if the PCIe rootport support > 2.5GB/s, but
- * current speed is 2.5 GB/s.
- */
- pcie_capability_read_word(dev, PCI_EXP_LNKCAP, &linkcap);
+ writel_relaxed(value, pcie->cra_base + reg);
+}
- if ((linkcap & PCI_EXP_LNKCAP_SLS) <= PCI_EXP_LNKCAP_SLS_2_5GB)
- return;
+static inline u32 cra_readl(struct altera_pcie *pcie, const u32 reg)
+{
+ return readl_relaxed(pcie->cra_base + reg);
+}
- pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &linkstat);
- if ((linkstat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB)
- pcie_capability_set_word(dev, PCI_EXP_LNKCTL,
- PCI_EXP_LNKCTL_RL);
+static bool altera_pcie_link_is_up(struct altera_pcie *pcie)
+{
+ return !!((cra_readl(pcie, RP_LTSSM) & RP_LTSSM_MASK) == LTSSM_L0);
}
-DECLARE_PCI_FIXUP_EARLY(0x1172, PCI_ANY_ID, altera_pcie_retrain);
/*
* Altera PCIe port uses BAR0 of RC's configuration space as the translation
@@ -119,17 +120,6 @@ static bool altera_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int devfn,
return false;
}
-static inline void cra_writel(struct altera_pcie *pcie, const u32 value,
- const u32 reg)
-{
- writel_relaxed(value, pcie->cra_base + reg);
-}
-
-static inline u32 cra_readl(struct altera_pcie *pcie, const u32 reg)
-{
- return readl_relaxed(pcie->cra_base + reg);
-}
-
static void tlp_write_tx(struct altera_pcie *pcie,
struct tlp_rp_regpair_t *tlp_rp_regdata)
{
@@ -138,11 +128,6 @@ static void tlp_write_tx(struct altera_pcie *pcie,
cra_writel(pcie, tlp_rp_regdata->ctrl, RP_TX_CNTRL);
}
-static bool altera_pcie_link_is_up(struct altera_pcie *pcie)
-{
- return !!(cra_readl(pcie, RP_LTSSM) & LTSSM_L0);
-}
-
static bool altera_pcie_valid_config(struct altera_pcie *pcie,
struct pci_bus *bus, int dev)
{
@@ -286,22 +271,14 @@ static int tlp_cfg_dword_write(struct altera_pcie *pcie, u8 bus, u32 devfn,
return PCIBIOS_SUCCESSFUL;
}
-static int altera_pcie_cfg_read(struct pci_bus *bus, unsigned int devfn,
- int where, int size, u32 *value)
+static int _altera_pcie_cfg_read(struct altera_pcie *pcie, u8 busno,
+ unsigned int devfn, int where, int size,
+ u32 *value)
{
- struct altera_pcie *pcie = bus->sysdata;
int ret;
u32 data;
u8 byte_en;
- if (altera_pcie_hide_rc_bar(bus, devfn, where))
- return PCIBIOS_BAD_REGISTER_NUMBER;
-
- if (!altera_pcie_valid_config(pcie, bus, PCI_SLOT(devfn))) {
- *value = 0xffffffff;
- return PCIBIOS_DEVICE_NOT_FOUND;
- }
-
switch (size) {
case 1:
byte_en = 1 << (where & 3);
@@ -314,7 +291,7 @@ static int altera_pcie_cfg_read(struct pci_bus *bus, unsigned int devfn,
break;
}
- ret = tlp_cfg_dword_read(pcie, bus->number, devfn,
+ ret = tlp_cfg_dword_read(pcie, busno, devfn,
(where & ~DWORD_MASK), byte_en, &data);
if (ret != PCIBIOS_SUCCESSFUL)
return ret;
@@ -334,20 +311,14 @@ static int altera_pcie_cfg_read(struct pci_bus *bus, unsigned int devfn,
return PCIBIOS_SUCCESSFUL;
}
-static int altera_pcie_cfg_write(struct pci_bus *bus, unsigned int devfn,
- int where, int size, u32 value)
+static int _altera_pcie_cfg_write(struct altera_pcie *pcie, u8 busno,
+ unsigned int devfn, int where, int size,
+ u32 value)
{
- struct altera_pcie *pcie = bus->sysdata;
u32 data32;
u32 shift = 8 * (where & 3);
u8 byte_en;
- if (altera_pcie_hide_rc_bar(bus, devfn, where))
- return PCIBIOS_BAD_REGISTER_NUMBER;
-
- if (!altera_pcie_valid_config(pcie, bus, PCI_SLOT(devfn)))
- return PCIBIOS_DEVICE_NOT_FOUND;
-
switch (size) {
case 1:
data32 = (value & 0xff) << shift;
@@ -363,8 +334,40 @@ static int altera_pcie_cfg_write(struct pci_bus *bus, unsigned int devfn,
break;
}
- return tlp_cfg_dword_write(pcie, bus->number, devfn,
- (where & ~DWORD_MASK), byte_en, data32);
+ return tlp_cfg_dword_write(pcie, busno, devfn, (where & ~DWORD_MASK),
+ byte_en, data32);
+}
+
+static int altera_pcie_cfg_read(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 *value)
+{
+ struct altera_pcie *pcie = bus->sysdata;
+
+ if (altera_pcie_hide_rc_bar(bus, devfn, where))
+ return PCIBIOS_BAD_REGISTER_NUMBER;
+
+ if (!altera_pcie_valid_config(pcie, bus, PCI_SLOT(devfn))) {
+ *value = 0xffffffff;
+ return PCIBIOS_DEVICE_NOT_FOUND;
+ }
+
+ return _altera_pcie_cfg_read(pcie, bus->number, devfn, where, size,
+ value);
+}
+
+static int altera_pcie_cfg_write(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 value)
+{
+ struct altera_pcie *pcie = bus->sysdata;
+
+ if (altera_pcie_hide_rc_bar(bus, devfn, where))
+ return PCIBIOS_BAD_REGISTER_NUMBER;
+
+ if (!altera_pcie_valid_config(pcie, bus, PCI_SLOT(devfn)))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ return _altera_pcie_cfg_write(pcie, bus->number, devfn, where, size,
+ value);
}
static struct pci_ops altera_pcie_ops = {
@@ -372,6 +375,90 @@ static struct pci_ops altera_pcie_ops = {
.write = altera_pcie_cfg_write,
};
+static int altera_read_cap_word(struct altera_pcie *pcie, u8 busno,
+ unsigned int devfn, int offset, u16 *value)
+{
+ u32 data;
+ int ret;
+
+ ret = _altera_pcie_cfg_read(pcie, busno, devfn,
+ PCIE_CAP_OFFSET + offset, sizeof(*value),
+ &data);
+ *value = data;
+ return ret;
+}
+
+static int altera_write_cap_word(struct altera_pcie *pcie, u8 busno,
+ unsigned int devfn, int offset, u16 value)
+{
+ return _altera_pcie_cfg_write(pcie, busno, devfn,
+ PCIE_CAP_OFFSET + offset, sizeof(value),
+ value);
+}
+
+static void altera_wait_link_retrain(struct altera_pcie *pcie)
+{
+ u16 reg16;
+ unsigned long start_jiffies;
+
+ /* Wait for link training to end. */
+ start_jiffies = jiffies;
+ for (;;) {
+ altera_read_cap_word(pcie, pcie->root_bus_nr, RP_DEVFN,
+ PCI_EXP_LNKSTA, &reg16);
+ if (!(reg16 & PCI_EXP_LNKSTA_LT))
+ break;
+
+ if (time_after(jiffies, start_jiffies + LINK_RETRAIN_TIMEOUT)) {
+ dev_err(&pcie->pdev->dev, "link retrain timeout\n");
+ break;
+ }
+ udelay(100);
+ }
+
+ /* Wait for the link to come up. */
+ start_jiffies = jiffies;
+ for (;;) {
+ if (altera_pcie_link_is_up(pcie))
+ break;
+
+ if (time_after(jiffies, start_jiffies + LINK_UP_TIMEOUT)) {
+ dev_err(&pcie->pdev->dev, "link up timeout\n");
+ break;
+ }
+ udelay(100);
+ }
+}
+
+static void altera_pcie_retrain(struct altera_pcie *pcie)
+{
+ u16 linkcap, linkstat, linkctl;
+
+ if (!altera_pcie_link_is_up(pcie))
+ return;
+
+ /*
+ * Set the retrain bit if the PCIe root port supports > 2.5 GT/s, but the
+ * current speed is 2.5 GT/s.
+ */
+ altera_read_cap_word(pcie, pcie->root_bus_nr, RP_DEVFN, PCI_EXP_LNKCAP,
+ &linkcap);
+ if ((linkcap & PCI_EXP_LNKCAP_SLS) <= PCI_EXP_LNKCAP_SLS_2_5GB)
+ return;
+
+ altera_read_cap_word(pcie, pcie->root_bus_nr, RP_DEVFN, PCI_EXP_LNKSTA,
+ &linkstat);
+ if ((linkstat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) {
+ altera_read_cap_word(pcie, pcie->root_bus_nr, RP_DEVFN,
+ PCI_EXP_LNKCTL, &linkctl);
+ linkctl |= PCI_EXP_LNKCTL_RL;
+ altera_write_cap_word(pcie, pcie->root_bus_nr, RP_DEVFN,
+ PCI_EXP_LNKCTL, linkctl);
+
+ altera_wait_link_retrain(pcie);
+ }
+}
+
static int altera_pcie_intx_map(struct irq_domain *domain, unsigned int irq,
irq_hw_number_t hwirq)
{
@@ -506,6 +593,11 @@ static int altera_pcie_parse_dt(struct altera_pcie *pcie)
return 0;
}
+static void altera_pcie_host_init(struct altera_pcie *pcie)
+{
+ altera_pcie_retrain(pcie);
+}
+
static int altera_pcie_probe(struct platform_device *pdev)
{
struct altera_pcie *pcie;
@@ -543,6 +635,7 @@ static int altera_pcie_probe(struct platform_device *pdev)
cra_writel(pcie, P2A_INT_STS_ALL, P2A_INT_STATUS);
/* enable all interrupts */
cra_writel(pcie, P2A_INT_ENA_ALL, P2A_INT_ENABLE);
+ altera_pcie_host_init(pcie);
bus = pci_scan_root_bus(&pdev->dev, pcie->root_bus_nr, &altera_pcie_ops,
pcie, &pcie->resources);
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index e1a10232a943..e44872fb9e5e 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -147,7 +147,7 @@ static void rproc_virtio_del_vqs(struct virtio_device *vdev)
static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[])
+ const char * const names[])
{
struct rproc *rproc = vdev_to_rproc(vdev);
int i, ret;
diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 73354ee27877..1fcd27c1f183 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -945,7 +945,7 @@ static void rpmsg_ns_cb(struct rpmsg_channel *rpdev, void *data, int len,
static int rpmsg_probe(struct virtio_device *vdev)
{
vq_callback_t *vq_cbs[] = { rpmsg_recv_done, rpmsg_xmit_done };
- const char *names[] = { "input", "output" };
+ static const char * const names[] = { "input", "output" };
struct virtqueue *vqs[2];
struct virtproc_info *vrp;
void *bufs_va;
diff --git a/drivers/s390/virtio/kvm_virtio.c b/drivers/s390/virtio/kvm_virtio.c
index 53fb975c404b..1d060fd293a3 100644
--- a/drivers/s390/virtio/kvm_virtio.c
+++ b/drivers/s390/virtio/kvm_virtio.c
@@ -255,7 +255,7 @@ static void kvm_del_vqs(struct virtio_device *vdev)
static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[])
+ const char * const names[])
{
struct kvm_device *kdev = to_kvmdev(vdev);
int i;
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index ff06bdfd2b20..9e685246b98d 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -639,7 +639,7 @@ out:
static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[])
+ const char * const names[])
{
struct virtio_ccw_device *vcdev = to_vc_device(vdev);
unsigned long *indicatorp = NULL;
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 3919ea066bf9..736de1021d8b 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1885,6 +1885,13 @@ static const struct usb_device_id acm_ids[] = {
.driver_info = IGNORE_DEVICE,
},
+ { USB_DEVICE(0x1bc7, 0x0021), /* Telit 3G ACM only composition */
+ .driver_info = SEND_ZERO_PACKET,
+ },
+ { USB_DEVICE(0x1bc7, 0x0023), /* Telit 3G ACM + ECM composition */
+ .driver_info = SEND_ZERO_PACKET,
+ },
+
/* control interfaces without any protocol set */
{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
USB_CDC_PROTO_NONE) },
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index cf378b1ed373..733479ddf8a7 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -240,7 +240,8 @@ static const struct usb_device_id usb_quirk_list[] = {
USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL },
/* Corsair K70 RGB */
- { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT },
+ { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT |
+ USB_QUIRK_DELAY_CTRL_MSG },
/* Corsair Strafe */
{ USB_DEVICE(0x1b1c, 0x1b15), .driver_info = USB_QUIRK_DELAY_INIT |
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index 6c186b4df94a..b3344a77dcce 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -223,8 +223,12 @@ static int slave_configure(struct scsi_device *sdev)
if (!(us->fflags & US_FL_NEEDS_CAP16))
sdev->try_rc_10_first = 1;
- /* assume SPC3 or latter devices support sense size > 18 */
- if (sdev->scsi_level > SCSI_SPC_2)
+ /*
+ * assume SPC3 or later devices support sense size > 18
+ * unless US_FL_BAD_SENSE quirk is specified.
+ */
+ if (sdev->scsi_level > SCSI_SPC_2 &&
+ !(us->fflags & US_FL_BAD_SENSE))
us->fflags |= US_FL_SANE_SENSE;
/* USB-IDE bridges tend to report SK = 0x04 (Non-recoverable
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index 898215cad351..d92b974f0635 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -1393,6 +1393,18 @@ UNUSUAL_DEV( 0x0d49, 0x7310, 0x0000, 0x9999,
US_FL_SANE_SENSE),
/*
+ * Reported by Icenowy Zheng <icenowy@aosc.io>
+ * The SMI SM3350 USB-UFS bridge controller will enter a wrong state
+ * in which it does not process read/write commands if a long sense is
+ * requested, so force the use of 18-byte sense.
+ */
+UNUSUAL_DEV( 0x090c, 0x3350, 0x0000, 0xffff,
+ "SMI",
+ "SM3350 UFS-to-USB-Mass-Storage bridge",
+ USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+ US_FL_BAD_SENSE ),
+
+/*
* Pete Zaitcev <zaitcev@yahoo.com>, bz#164688.
* The device blatantly ignores LUN and returns 1 in GetMaxLUN.
*/
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 533eaf04f12f..40764ecad9ce 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -2,7 +2,6 @@ config VHOST_NET
tristate "Host kernel accelerator for virtio net"
depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
select VHOST
- select VHOST_RING
---help---
This kernel module can be loaded in host kernel to accelerate
guest networking with virtio_net. Not to be confused with virtio_net
@@ -15,17 +14,24 @@ config VHOST_SCSI
tristate "VHOST_SCSI TCM fabric driver"
depends on TARGET_CORE && EVENTFD && m
select VHOST
- select VHOST_RING
default n
---help---
Say M here to enable the vhost_scsi TCM fabric module
for use with virtio-scsi guests
-config VHOST_RING
- tristate
+config VHOST_VSOCK
+ tristate "vhost virtio-vsock driver"
+ depends on VSOCKETS && EVENTFD
+ select VIRTIO_VSOCKETS_COMMON
+ select VHOST
+ default n
---help---
- This option is selected by any driver which needs to access
- the host side of a virtio ring.
+ This kernel module can be loaded in the host kernel to provide AF_VSOCK
+ sockets for communicating with guests. The guests must have the
+ virtio_transport.ko driver loaded to use the virtio-vsock device.
+
+ To compile this driver as a module, choose M here: the module will be called
+ vhost_vsock.
config VHOST
tristate
diff --git a/drivers/vhost/Kconfig.vringh b/drivers/vhost/Kconfig.vringh
new file mode 100644
index 000000000000..6a4490c09d7f
--- /dev/null
+++ b/drivers/vhost/Kconfig.vringh
@@ -0,0 +1,5 @@
+config VHOST_RING
+ tristate
+ ---help---
+ This option is selected by any driver which needs to access
+ the host side of a virtio ring.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index e0441c34db1c..6b012b986b57 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -4,5 +4,9 @@ vhost_net-y := net.o
obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
vhost_scsi-y := scsi.o
+obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
+vhost_vsock-y := vsock.o
+
obj-$(CONFIG_VHOST_RING) += vringh.o
+
obj-$(CONFIG_VHOST) += vhost.o
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 645b2197930e..53cf130922f3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -61,7 +61,8 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
- (1ULL << VIRTIO_NET_F_MRG_RXBUF)
+ (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM)
};
enum {
@@ -287,6 +288,69 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
rcu_read_unlock_bh();
}
+static inline unsigned long busy_clock(void)
+{
+ return local_clock() >> 10;
+}
+
+static bool vhost_can_busy_poll(struct vhost_dev *dev,
+ unsigned long endtime)
+{
+ return likely(!need_resched()) &&
+ likely(!time_after(busy_clock(), endtime)) &&
+ likely(!signal_pending(current)) &&
+ !vhost_has_work(dev);
+}
+
+static void vhost_net_disable_vq(struct vhost_net *n,
+ struct vhost_virtqueue *vq)
+{
+ struct vhost_net_virtqueue *nvq =
+ container_of(vq, struct vhost_net_virtqueue, vq);
+ struct vhost_poll *poll = n->poll + (nvq - n->vqs);
+ if (!vq->private_data)
+ return;
+ vhost_poll_stop(poll);
+}
+
+static int vhost_net_enable_vq(struct vhost_net *n,
+ struct vhost_virtqueue *vq)
+{
+ struct vhost_net_virtqueue *nvq =
+ container_of(vq, struct vhost_net_virtqueue, vq);
+ struct vhost_poll *poll = n->poll + (nvq - n->vqs);
+ struct socket *sock;
+
+ sock = vq->private_data;
+ if (!sock)
+ return 0;
+
+ return vhost_poll_start(poll, sock->file);
+}
+
+static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num)
+{
+ unsigned long uninitialized_var(endtime);
+ int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ out_num, in_num, NULL, NULL);
+
+ if (r == vq->num && vq->busyloop_timeout) {
+ preempt_disable();
+ endtime = busy_clock() + vq->busyloop_timeout;
+ while (vhost_can_busy_poll(vq->dev, endtime) &&
+ vhost_vq_avail_empty(vq->dev, vq))
+ cpu_relax_lowlatency();
+ preempt_enable();
+ r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ out_num, in_num, NULL, NULL);
+ }
+
+ return r;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
@@ -314,6 +378,9 @@ static void handle_tx(struct vhost_net *net)
if (!sock)
goto out;
+ if (!vq_iotlb_prefetch(vq))
+ goto out;
+
vhost_disable_notify(&net->dev, vq);
hdr_size = nvq->vhost_hlen;
@@ -331,10 +398,9 @@ static void handle_tx(struct vhost_net *net)
% UIO_MAXIOV == nvq->done_idx))
break;
- head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov),
- &out, &in,
- NULL, NULL);
+ head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
@@ -435,6 +501,43 @@ static int peek_head_len(struct sock *sk)
return len;
}
+/*
+ * Return the length reported by peek_head_len() for @sk, busy-polling
+ * first when the socket has nothing queued.
+ *
+ * The busy poll is controlled by the TX virtqueue's busyloop_timeout and
+ * watches both the RX socket's receive queue and the TX avail ring.  The TX
+ * vq mutex is taken with lockdep subclass 1; handle_rx() takes its own vq
+ * mutex with subclass 0 before calling this.
+ */
+static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
+{
+ struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *vq = &nvq->vq;
+ unsigned long uninitialized_var(endtime);
+ int len = peek_head_len(sk);
+
+ if (!len && vq->busyloop_timeout) {
+ /* Both the tx vq and the rx socket are polled here */
+ mutex_lock_nested(&vq->mutex, 1);
+ vhost_disable_notify(&net->dev, vq);
+
+ preempt_disable();
+ endtime = busy_clock() + vq->busyloop_timeout;
+
+ /* Spin until timeout, rx data arrives, or tx buffers appear. */
+ while (vhost_can_busy_poll(&net->dev, endtime) &&
+ skb_queue_empty(&sk->sk_receive_queue) &&
+ vhost_vq_avail_empty(&net->dev, vq))
+ cpu_relax_lowlatency();
+
+ preempt_enable();
+
+ /* TX work seen while polling is handed to the TX poll worker. */
+ if (!vhost_vq_avail_empty(&net->dev, vq))
+ vhost_poll_queue(&vq->poll);
+ else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ /* NOTE(review): a non-zero return presumably means buffers
+ * arrived while notifications were off - confirm against
+ * vhost_enable_notify().
+ */
+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
+
+ mutex_unlock(&vq->mutex);
+
+ /* Re-check the socket now that polling is over. */
+ len = peek_head_len(sk);
+ }
+
+ return len;
+}
+}
+
/* This is a multi-buffer version of vhost_get_desc, that works if
* vq has read descriptors only.
* @vq - the relevant virtqueue
@@ -540,11 +643,16 @@ static void handle_rx(struct vhost_net *net)
struct iov_iter fixup;
__virtio16 num_buffers;
- mutex_lock(&vq->mutex);
+ mutex_lock_nested(&vq->mutex, 0);
sock = vq->private_data;
if (!sock)
goto out;
+
+ if (!vq_iotlb_prefetch(vq))
+ goto out;
+
vhost_disable_notify(&net->dev, vq);
+ vhost_net_disable_vq(net, vq);
vhost_hlen = nvq->vhost_hlen;
sock_hlen = nvq->sock_hlen;
@@ -553,7 +661,7 @@ static void handle_rx(struct vhost_net *net)
vq->log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
- while ((sock_len = peek_head_len(sock->sk))) {
+ while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads, vhost_len,
@@ -561,7 +669,7 @@ static void handle_rx(struct vhost_net *net)
likely(mergeable) ? UIO_MAXIOV : 1);
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
- break;
+ goto out;
/* On overrun, truncate and discard */
if (unlikely(headcount > UIO_MAXIOV)) {
iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
@@ -580,7 +688,7 @@ static void handle_rx(struct vhost_net *net)
}
/* Nothing new? Wait for eventfd to tell us
* they refilled. */
- break;
+ goto out;
}
/* We don't need to be notified again. */
iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
@@ -608,7 +716,7 @@ static void handle_rx(struct vhost_net *net)
&fixup) != sizeof(hdr)) {
vq_err(vq, "Unable to write vnet_hdr "
"at addr %p\n", vq->iov->iov_base);
- break;
+ goto out;
}
} else {
/* Header came from socket; we'll need to patch
@@ -624,7 +732,7 @@ static void handle_rx(struct vhost_net *net)
&fixup) != sizeof num_buffers) {
vq_err(vq, "Failed num_buffers write");
vhost_discard_vq_desc(vq, headcount);
- break;
+ goto out;
}
vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
headcount);
@@ -633,9 +741,10 @@ static void handle_rx(struct vhost_net *net)
total_len += vhost_len;
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
- break;
+ goto out;
}
}
+ vhost_net_enable_vq(net, vq);
out:
mutex_unlock(&vq->mutex);
}
@@ -714,32 +823,6 @@ static int vhost_net_open(struct inode *inode, struct file *f)
return 0;
}
-static void vhost_net_disable_vq(struct vhost_net *n,
- struct vhost_virtqueue *vq)
-{
- struct vhost_net_virtqueue *nvq =
- container_of(vq, struct vhost_net_virtqueue, vq);
- struct vhost_poll *poll = n->poll + (nvq - n->vqs);
- if (!vq->private_data)
- return;
- vhost_poll_stop(poll);
-}
-
-static int vhost_net_enable_vq(struct vhost_net *n,
- struct vhost_virtqueue *vq)
-{
- struct vhost_net_virtqueue *nvq =
- container_of(vq, struct vhost_net_virtqueue, vq);
- struct vhost_poll *poll = n->poll + (nvq - n->vqs);
- struct socket *sock;
-
- sock = vq->private_data;
- if (!sock)
- return 0;
-
- return vhost_poll_start(poll, sock->file);
-}
-
static struct socket *vhost_net_stop_vq(struct vhost_net *n,
struct vhost_virtqueue *vq)
{
@@ -917,7 +1000,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
vhost_net_disable_vq(n, vq);
vq->private_data = sock;
- r = vhost_init_used(vq);
+ r = vhost_vq_init_access(vq);
if (r)
goto err_used;
r = vhost_net_enable_vq(n, vq);
@@ -969,21 +1052,21 @@ static long vhost_net_reset_owner(struct vhost_net *n)
struct socket *tx_sock = NULL;
struct socket *rx_sock = NULL;
long err;
- struct vhost_memory *memory;
+ struct vhost_umem *umem;
mutex_lock(&n->dev.mutex);
err = vhost_dev_check_owner(&n->dev);
if (err)
goto done;
- memory = vhost_dev_reset_owner_prepare();
- if (!memory) {
+ umem = vhost_dev_reset_owner_prepare();
+ if (!umem) {
err = -ENOMEM;
goto done;
}
vhost_net_stop(n, &tx_sock, &rx_sock);
vhost_net_flush(n);
vhost_dev_stop(&n->dev);
- vhost_dev_reset_owner(&n->dev, memory);
+ vhost_dev_reset_owner(&n->dev, umem);
vhost_net_vq_reset(n);
done:
mutex_unlock(&n->dev.mutex);
@@ -1014,10 +1097,14 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
}
mutex_lock(&n->dev.mutex);
if ((features & (1 << VHOST_F_LOG_ALL)) &&
- !vhost_log_access_ok(&n->dev)) {
- mutex_unlock(&n->dev.mutex);
- return -EFAULT;
+ !vhost_log_access_ok(&n->dev))
+ goto out_unlock;
+
+ if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
+ if (vhost_init_device_iotlb(&n->dev, true))
+ goto out_unlock;
}
+
for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
mutex_lock(&n->vqs[i].vq.mutex);
n->vqs[i].vq.acked_features = features;
@@ -1027,6 +1114,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
}
mutex_unlock(&n->dev.mutex);
return 0;
+
+out_unlock:
+ mutex_unlock(&n->dev.mutex);
+ return -EFAULT;
}
static long vhost_net_set_owner(struct vhost_net *n)
@@ -1100,9 +1191,40 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
}
#endif
+/*
+ * read_iter file operation: forward to vhost_chr_read_iter() on the
+ * vhost_dev embedded in this file's vhost_net, non-blocking when the file
+ * was opened/flagged O_NONBLOCK.
+ */
+static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *filp = iocb->ki_filp;
+	struct vhost_net *net = filp->private_data;
+
+	return vhost_chr_read_iter(&net->dev, to, filp->f_flags & O_NONBLOCK);
+}
+
+/*
+ * write_iter file operation: forward to vhost_chr_write_iter() on the
+ * vhost_dev embedded in this file's vhost_net.
+ */
+static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
+					struct iov_iter *from)
+{
+	struct vhost_net *net = iocb->ki_filp->private_data;
+
+	return vhost_chr_write_iter(&net->dev, from);
+}
+
+/*
+ * poll file operation: forward to vhost_chr_poll() on the vhost_dev
+ * embedded in this file's vhost_net.
+ */
+static unsigned int vhost_net_chr_poll(struct file *file, poll_table *wait)
+{
+	struct vhost_net *net = file->private_data;
+
+	return vhost_chr_poll(file, &net->dev, wait);
+}
+
static const struct file_operations vhost_net_fops = {
.owner = THIS_MODULE,
.release = vhost_net_release,
+ .read_iter = vhost_net_chr_read_iter,
+ .write_iter = vhost_net_chr_write_iter,
+ .poll = vhost_net_chr_poll,
.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = vhost_net_compat_ioctl,
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 8fc62a03637a..009315f006bf 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1277,7 +1277,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
vq = &vs->vqs[i].vq;
mutex_lock(&vq->mutex);
vq->private_data = vs_tpg;
- vhost_init_used(vq);
+ vhost_vq_init_access(vq);
mutex_unlock(&vq->mutex);
}
ret = 0;
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index f2882ac98726..388eec4e1a90 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -196,7 +196,7 @@ static long vhost_test_run(struct vhost_test *n, int test)
oldpriv = vq->private_data;
vq->private_data = priv;
- r = vhost_init_used(&n->vqs[index]);
+ r = vhost_vq_init_access(&n->vqs[index]);
mutex_unlock(&vq->mutex);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ed0a356d1d3..53b1b3cfce84 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -27,6 +27,7 @@
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sort.h>
+#include <linux/interval_tree_generic.h>
#include <linux/nospec.h>
#include "vhost.h"
@@ -35,6 +36,10 @@ static ushort max_mem_regions = 64;
module_param(max_mem_regions, ushort, 0444);
MODULE_PARM_DESC(max_mem_regions,
"Maximum number of memory regions in memory map. (default: 64)");
+/*
+ * Read-only (0444) module parameter.  vhost_new_umem_range() compares a
+ * umem's node count against this value and drops the first node on the
+ * umem list before inserting once the count is reached.
+ */
+static int max_iotlb_entries = 2048;
+module_param(max_iotlb_entries, int, 0444);
+MODULE_PARM_DESC(max_iotlb_entries,
+ "Maximum number of iotlb entries. (default: 2048)");
enum {
VHOST_MEMORY_F_LOG = 0x1,
@@ -43,12 +48,26 @@ enum {
#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
+/*
+ * Instantiate the generic interval tree for struct vhost_umem_node.  The
+ * generated helpers carry the vhost_umem_interval_tree prefix (_insert,
+ * _remove and _iter_first are used in this file).
+ * NOTE(review): START/LAST are expected to be accessor macros defined
+ * elsewhere (presumably vhost.h) - confirm.
+ */
+INTERVAL_TREE_DEFINE(struct vhost_umem_node,
+ rb, __u64, __subtree_last,
+ START, LAST, , vhost_umem_interval_tree);
+
#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
-static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
+static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
vq->user_be = !virtio_legacy_is_little_endian();
}
+/* Mark the legacy vring of @vq as big-endian (user_be = true). */
+static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
+{
+ vq->user_be = true;
+}
+
+/* Mark the legacy vring of @vq as little-endian (user_be = false). */
+static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
+{
+ vq->user_be = false;
+}
+
static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
struct vhost_vring_state s;
@@ -63,7 +82,10 @@ static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
s.num != VHOST_VRING_BIG_ENDIAN)
return -EINVAL;
- vq->user_be = s.num;
+ if (s.num == VHOST_VRING_BIG_ENDIAN)
+ vhost_enable_cross_endian_big(vq);
+ else
+ vhost_enable_cross_endian_little(vq);
return 0;
}
@@ -92,7 +114,7 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq)
vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
}
#else
-static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
+static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
}
@@ -109,11 +131,29 @@ static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
- if (vhost_has_feature(vq, VIRTIO_F_VERSION_1))
- vq->is_le = true;
+ vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)
+ || virtio_legacy_is_little_endian();
}
#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
+/*
+ * Reset vq->is_le by re-running vhost_init_is_le(), i.e. recompute it from
+ * the currently acked features (and, with cross-endian support built in,
+ * from vq->user_be).
+ */
+static void vhost_reset_is_le(struct vhost_virtqueue *vq)
+{
+ vhost_init_is_le(vq);
+}
+
+/*
+ * On-stack helper used by vhost_work_flush(): a work item plus the
+ * completion that vhost_flush_work() signals when that item runs.
+ */
+struct vhost_flush_struct {
+ struct vhost_work work;
+ struct completion wait_event;
+};
+
+/*
+ * Work function for a vhost_flush_struct: signal the completion embedded
+ * next to @work so the flusher waiting on it can proceed.
+ */
+static void vhost_flush_work(struct vhost_work *work)
+{
+	struct vhost_flush_struct *flush =
+		container_of(work, struct vhost_flush_struct, work);
+
+	complete(&flush->wait_event);
+}
+
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
@@ -138,11 +178,9 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
- INIT_LIST_HEAD(&work->node);
+ clear_bit(VHOST_WORK_QUEUED, &work->flags);
work->fn = fn;
init_waitqueue_head(&work->done);
- work->flushing = 0;
- work->queue_seq = work->done_seq = 0;
}
EXPORT_SYMBOL_GPL(vhost_work_init);
@@ -193,31 +231,17 @@ void vhost_poll_stop(struct vhost_poll *poll)
}
EXPORT_SYMBOL_GPL(vhost_poll_stop);
-static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
- unsigned seq)
-{
- int left;
-
- spin_lock_irq(&dev->work_lock);
- left = seq - work->done_seq;
- spin_unlock_irq(&dev->work_lock);
- return left <= 0;
-}
-
void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
{
- unsigned seq;
- int flushing;
+ struct vhost_flush_struct flush;
+
+ if (dev->worker) {
+ init_completion(&flush.wait_event);
+ vhost_work_init(&flush.work, vhost_flush_work);
- spin_lock_irq(&dev->work_lock);
- seq = work->queue_seq;
- work->flushing++;
- spin_unlock_irq(&dev->work_lock);
- wait_event(work->done, vhost_work_seq_done(dev, work, seq));
- spin_lock_irq(&dev->work_lock);
- flushing = --work->flushing;
- spin_unlock_irq(&dev->work_lock);
- BUG_ON(flushing < 0);
+ vhost_work_queue(dev, &flush.work);
+ wait_for_completion(&flush.wait_event);
+ }
}
EXPORT_SYMBOL_GPL(vhost_work_flush);
@@ -231,20 +255,27 @@ EXPORT_SYMBOL_GPL(vhost_poll_flush);
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
{
- unsigned long flags;
+ if (!dev->worker)
+ return;
- spin_lock_irqsave(&dev->work_lock, flags);
- if (list_empty(&work->node)) {
- list_add_tail(&work->node, &dev->work_list);
- work->queue_seq++;
- spin_unlock_irqrestore(&dev->work_lock, flags);
+ if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
+ /* We can only add the work to the list after we're
+ * sure it was not in the list.
+ */
+ smp_mb();
+ llist_add(&work->node, &dev->work_list);
wake_up_process(dev->worker);
- } else {
- spin_unlock_irqrestore(&dev->work_lock, flags);
}
}
EXPORT_SYMBOL_GPL(vhost_work_queue);
+/*
+ * A lockless hint for busy polling code to exit the loop.
+ *
+ * Returns true when dev->work_list is non-empty.  No lock is taken, so the
+ * answer may already be stale when the caller looks at it.
+ */
+bool vhost_has_work(struct vhost_dev *dev)
+{
+ return !llist_empty(&dev->work_list);
+}
+EXPORT_SYMBOL_GPL(vhost_has_work);
+
void vhost_poll_queue(struct vhost_poll *poll)
{
vhost_work_queue(poll->dev, &poll->work);
@@ -275,16 +306,18 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call_ctx = NULL;
vq->call = NULL;
vq->log_ctx = NULL;
- vq->memory = NULL;
- vq->is_le = virtio_legacy_is_little_endian();
- vhost_vq_reset_user_be(vq);
+ vhost_reset_is_le(vq);
+ vhost_disable_cross_endian(vq);
+ vq->busyloop_timeout = 0;
+ vq->umem = NULL;
+ vq->iotlb = NULL;
}
static int vhost_worker(void *data)
{
struct vhost_dev *dev = data;
- struct vhost_work *work = NULL;
- unsigned uninitialized_var(seq);
+ struct vhost_work *work, *work_next;
+ struct llist_node *node;
mm_segment_t oldfs = get_fs();
set_fs(USER_DS);
@@ -294,35 +327,25 @@ static int vhost_worker(void *data)
/* mb paired w/ kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
- spin_lock_irq(&dev->work_lock);
- if (work) {
- work->done_seq = seq;
- if (work->flushing)
- wake_up_all(&work->done);
- }
-
if (kthread_should_stop()) {
- spin_unlock_irq(&dev->work_lock);
__set_current_state(TASK_RUNNING);
break;
}
- if (!list_empty(&dev->work_list)) {
- work = list_first_entry(&dev->work_list,
- struct vhost_work, node);
- list_del_init(&work->node);
- seq = work->queue_seq;
- } else
- work = NULL;
- spin_unlock_irq(&dev->work_lock);
- if (work) {
+ node = llist_del_all(&dev->work_list);
+ if (!node)
+ schedule();
+
+ node = llist_reverse_order(node);
+ /* make sure flag is seen after deletion */
+ smp_wmb();
+ llist_for_each_entry_safe(work, work_next, node, node) {
+ clear_bit(VHOST_WORK_QUEUED, &work->flags);
__set_current_state(TASK_RUNNING);
work->fn(work);
if (need_resched())
schedule();
- } else
- schedule();
-
+ }
}
unuse_mm(dev->mm);
set_fs(oldfs);
@@ -381,11 +404,16 @@ void vhost_dev_init(struct vhost_dev *dev,
mutex_init(&dev->mutex);
dev->log_ctx = NULL;
dev->log_file = NULL;
- dev->memory = NULL;
+ dev->umem = NULL;
+ dev->iotlb = NULL;
dev->mm = NULL;
- spin_lock_init(&dev->work_lock);
- INIT_LIST_HEAD(&dev->work_list);
dev->worker = NULL;
+ init_llist_head(&dev->work_list);
+ init_waitqueue_head(&dev->wait);
+ INIT_LIST_HEAD(&dev->read_list);
+ INIT_LIST_HEAD(&dev->pending_list);
+ spin_lock_init(&dev->iotlb_lock);
+
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
@@ -486,27 +514,36 @@ err_mm:
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
-struct vhost_memory *vhost_dev_reset_owner_prepare(void)
+/*
+ * Allocate @size zeroed bytes: try kzalloc() first (without allocation
+ * failure warnings) and fall back to vzalloc() if that fails.
+ * Returns NULL when both attempts fail.
+ */
+static void *vhost_kvzalloc(unsigned long size)
+{
+	void *mem;
+
+	mem = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	return mem ? mem : vzalloc(size);
+}
+
+struct vhost_umem *vhost_dev_reset_owner_prepare(void)
{
- return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
+ return vhost_kvzalloc(sizeof(struct vhost_umem));
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
/* Caller should have device mutex */
-void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory)
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
{
int i;
vhost_dev_cleanup(dev, true);
/* Restore memory to default empty mapping. */
- memory->nregions = 0;
- dev->memory = memory;
+ INIT_LIST_HEAD(&umem->umem_list);
+ dev->umem = umem;
/* We don't need VQ locks below since vhost_dev_cleanup makes sure
* VQs aren't running.
*/
for (i = 0; i < dev->nvqs; ++i)
- dev->vqs[i]->memory = memory;
+ dev->vqs[i]->umem = umem;
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
@@ -523,6 +560,47 @@ void vhost_dev_stop(struct vhost_dev *dev)
}
EXPORT_SYMBOL_GPL(vhost_dev_stop);
+/*
+ * Detach @node from both the list and the interval tree of @umem, account
+ * for the removal in umem->numem, and free the node.
+ */
+static void vhost_umem_free(struct vhost_umem *umem,
+			    struct vhost_umem_node *node)
+{
+	list_del(&node->link);
+	vhost_umem_interval_tree_remove(node, &umem->umem_tree);
+	umem->numem--;
+	kfree(node);
+}
+
+/*
+ * Free every node of @umem and then @umem itself.  A NULL @umem is
+ * accepted and ignored.
+ */
+static void vhost_umem_clean(struct vhost_umem *umem)
+{
+	if (!umem)
+		return;
+
+	/* vhost_umem_free() unlinks the node, so the list shrinks each pass. */
+	while (!list_empty(&umem->umem_list))
+		vhost_umem_free(umem,
+				list_first_entry(&umem->umem_list,
+						 struct vhost_umem_node, link));
+
+	kvfree(umem);
+}
+
+/* Unlink and free every vhost_msg_node queued on @head. */
+static void vhost_free_msg_list(struct list_head *head)
+{
+	struct vhost_msg_node *msg, *next;
+
+	list_for_each_entry_safe(msg, next, head, node) {
+		list_del(&msg->node);
+		kfree(msg);
+	}
+}
+
+/*
+ * Drop all queued messages of @dev: first the read list, then the pending
+ * list, both under dev->iotlb_lock.
+ */
+static void vhost_clear_msg(struct vhost_dev *dev)
+{
+	spin_lock(&dev->iotlb_lock);
+	vhost_free_msg_list(&dev->read_list);
+	vhost_free_msg_list(&dev->pending_list);
+	spin_unlock(&dev->iotlb_lock);
+}
+
/* Caller should have device mutex if and only if locked is set */
void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
{
@@ -549,9 +627,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
fput(dev->log_file);
dev->log_file = NULL;
/* No one will access memory at this point */
- kvfree(dev->memory);
- dev->memory = NULL;
- WARN_ON(!list_empty(&dev->work_list));
+ vhost_umem_clean(dev->umem);
+ dev->umem = NULL;
+ vhost_umem_clean(dev->iotlb);
+ dev->iotlb = NULL;
+ vhost_clear_msg(dev);
+ wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);
+ WARN_ON(!llist_empty(&dev->work_list));
if (dev->worker) {
kthread_stop(dev->worker);
dev->worker = NULL;
@@ -575,26 +657,34 @@ static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
(sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
}
+/*
+ * Return true when @uaddr, @size, or their sum does not fit in an
+ * unsigned long, i.e. when the range cannot be expressed as a native
+ * address range.
+ */
+static bool vhost_overflow(u64 uaddr, u64 size)
+{
+	/* Either operand alone is already too large. */
+	if (uaddr > ULONG_MAX || size > ULONG_MAX)
+		return true;
+
+	/* Written so that the 64-bit comparison itself cannot wrap. */
+	return uaddr > ULONG_MAX - size;
+}
+
/* Caller should have vq mutex and device mutex. */
-static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
+static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
int log_all)
{
- int i;
+ struct vhost_umem_node *node;
- if (!mem)
+ if (!umem)
return 0;
- for (i = 0; i < mem->nregions; ++i) {
- struct vhost_memory_region *m = mem->regions + i;
- unsigned long a = m->userspace_addr;
- if (m->memory_size > ULONG_MAX)
+ list_for_each_entry(node, &umem->umem_list, link) {
+ unsigned long a = node->userspace_addr;
+
+ if (vhost_overflow(node->userspace_addr, node->size))
return 0;
- else if (!access_ok(VERIFY_WRITE, (void __user *)a,
- m->memory_size))
+
+
+ if (!access_ok(VERIFY_WRITE, (void __user *)a,
+ node->size))
return 0;
else if (log_all && !log_access_ok(log_base,
- m->guest_phys_addr,
- m->memory_size))
+ node->start,
+ node->size))
return 0;
}
return 1;
@@ -602,7 +692,7 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
/* Can we switch to this memory table? */
/* Caller should have device mutex but not vq mutex */
-static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
+static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
int log_all)
{
int i;
@@ -615,7 +705,8 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
/* If ring is inactive, will check when it's enabled. */
if (d->vqs[i]->private_data)
- ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log);
+ ok = vq_memory_access_ok(d->vqs[i]->log_base,
+ umem, log);
else
ok = 1;
mutex_unlock(&d->vqs[i]->mutex);
@@ -625,12 +716,388 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
return 1;
}
+/*
+ * Forward declaration: the IOTLB-aware copy helpers below call
+ * translate_desc() before its definition appears in this file.
+ */
+static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
+ struct iovec iov[], int iov_size, int access);
+
+/*
+ * Copy @size bytes from the kernel buffer @from to the address @to.
+ *
+ * Without a device IOTLB (vq->iotlb == NULL), @to is handed straight to
+ * __copy_to_user() and its return value is passed through.  With an IOTLB,
+ * @to is translated by translate_desc() into vq->iotlb_iov and the data is
+ * written through an iov_iter.
+ *
+ * Returns 0 on success and non-zero on failure.  In the IOTLB path a
+ * translation failure returns translate_desc()'s negative value and a short
+ * copy returns -EFAULT.  Previously a short copy returned the number of
+ * bytes copied, so a copy of zero bytes was reported as success.
+ * NOTE(review): callers are assumed to only test the result for non-zero -
+ * confirm against the call sites.
+ */
+static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to,
+			      const void *from, unsigned size)
+{
+	struct iov_iter t;
+	size_t copied;
+	int ret;
+
+	if (!vq->iotlb)
+		return __copy_to_user(to, from, size);
+
+	/* This function should be called after iotlb
+	 * prefetch, which means we're sure that all vq
+	 * could be accessed through iotlb. So -EAGAIN should
+	 * not happen in this case.
+	 */
+	/* TODO: more fast path */
+	ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
+			     ARRAY_SIZE(vq->iotlb_iov),
+			     VHOST_ACCESS_WO);
+	if (ret < 0)
+		return ret;
+
+	/* On success ret is the iovec count passed to iov_iter_init(). */
+	iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
+	copied = copy_to_iter(from, size, &t);
+
+	return copied == size ? 0 : -EFAULT;
+}
+
+/*
+ * Copy @size bytes from the address @from into the kernel buffer @to.
+ *
+ * Without a device IOTLB (vq->iotlb == NULL), @from is handed straight to
+ * __copy_from_user() and its return value is passed through.  With an
+ * IOTLB, @from is translated by translate_desc() into vq->iotlb_iov and
+ * the data is read through an iov_iter.
+ *
+ * Returns 0 on success and non-zero on failure.  In the IOTLB path a
+ * translation failure is logged and returns translate_desc()'s negative
+ * value; a short copy returns -EFAULT.  Previously a short copy returned
+ * the number of bytes copied, so a copy of zero bytes was reported as
+ * success.
+ * NOTE(review): callers are assumed to only test the result for non-zero -
+ * confirm against the call sites.
+ */
+static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
+				void *from, unsigned size)
+{
+	struct iov_iter f;
+	size_t copied;
+	int ret;
+
+	if (!vq->iotlb)
+		return __copy_from_user(to, from, size);
+
+	/* This function should be called after iotlb
+	 * prefetch, which means we're sure that vq
+	 * could be accessed through iotlb. So -EAGAIN should
+	 * not happen in this case.
+	 */
+	/* TODO: more fast path */
+	ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
+			     ARRAY_SIZE(vq->iotlb_iov),
+			     VHOST_ACCESS_RO);
+	if (ret < 0) {
+		vq_err(vq, "IOTLB translation failure: uaddr "
+		       "%p size 0x%llx\n", from,
+		       (unsigned long long) size);
+		return ret;
+	}
+
+	/* On success ret is the iovec count passed to iov_iter_init(). */
+	iov_iter_init(&f, READ, vq->iotlb_iov, ret, size);
+	copied = copy_from_iter(to, size, &f);
+
+	return copied == size ? 0 : -EFAULT;
+}
+
+/*
+ * Translate @addr (an address of @size bytes as seen through the device
+ * IOTLB) into a single userspace pointer.
+ *
+ * Returns the translated pointer, or NULL when translate_desc() fails or
+ * when the range does not map to exactly one iovec of exactly @size bytes.
+ * Both failures are logged with vq_err().  The translation is always
+ * requested with VHOST_ACCESS_RO, including when called from
+ * vhost_put_user().
+ */
+static void __user *__vhost_get_user(struct vhost_virtqueue *vq,
+ void *addr, unsigned size)
+{
+ int ret;
+
+ /* This function should be called after iotlb
+ * prefetch, which means we're sure that vq
+ * could be accessed through iotlb. So -EAGAIN should
+ * not happen in this case.
+ */
+ /* TODO: more fast path */
+ ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
+ ARRAY_SIZE(vq->iotlb_iov),
+ VHOST_ACCESS_RO);
+ if (ret < 0) {
+ vq_err(vq, "IOTLB translation failure: uaddr "
+ "%p size 0x%llx\n", addr,
+ (unsigned long long) size);
+ return NULL;
+ }
+
+ /* The access must land in one contiguous chunk of the full size. */
+ if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
+ vq_err(vq, "Non atomic userspace memory access: uaddr "
+ "%p size 0x%llx\n", addr,
+ (unsigned long long) size);
+ return NULL;
+ }
+
+ return vq->iotlb_iov[0].iov_base;
+}
+
+/*
+ * vhost_put_user(vq, x, ptr) - store @x at @ptr on behalf of @vq.
+ *
+ * Without a device IOTLB this is __put_user(x, ptr).  With one, @ptr is
+ * first translated by __vhost_get_user(); a NULL translation yields
+ * -EFAULT.  Evaluates to the __put_user() result or -EFAULT.
+ * Note: @vq and @ptr are expanded more than once and unparenthesized, so
+ * pass plain identifiers/expressions without side effects.
+ */
+#define vhost_put_user(vq, x, ptr) \
+({ \
+ int ret = -EFAULT; \
+ if (!vq->iotlb) { \
+ ret = __put_user(x, ptr); \
+ } else { \
+ __typeof__(ptr) to = \
+ (__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \
+ if (to != NULL) \
+ ret = __put_user(x, to); \
+ else \
+ ret = -EFAULT; \
+ } \
+ ret; \
+})
+
+/*
+ * vhost_get_user(vq, x, ptr) - load @x from @ptr on behalf of @vq.
+ *
+ * Without a device IOTLB this is __get_user(x, ptr).  With one, @ptr is
+ * first translated by __vhost_get_user(); a NULL translation yields
+ * -EFAULT.  Evaluates to the __get_user() result or -EFAULT.
+ * Note: @vq and @ptr are expanded more than once and unparenthesized, so
+ * pass plain identifiers/expressions without side effects.
+ */
+#define vhost_get_user(vq, x, ptr) \
+({ \
+ int ret; \
+ if (!vq->iotlb) { \
+ ret = __get_user(x, ptr); \
+ } else { \
+ __typeof__(ptr) from = \
+ (__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \
+ if (from != NULL) \
+ ret = __get_user(x, from); \
+ else \
+ ret = -EFAULT; \
+ } \
+ ret; \
+})
+
+/*
+ * Take the mutex of every virtqueue of @d in index order, using the
+ * virtqueue index as the lockdep nesting subclass.
+ */
+static void vhost_dev_lock_vqs(struct vhost_dev *d)
+{
+	int idx;
+
+	for (idx = 0; idx < d->nvqs; idx++)
+		mutex_lock_nested(&d->vqs[idx]->mutex, idx);
+}
+
+/* Release the mutex of every virtqueue of @d, in index order. */
+static void vhost_dev_unlock_vqs(struct vhost_dev *d)
+{
+	int idx;
+
+	for (idx = 0; idx < d->nvqs; idx++)
+		mutex_unlock(&d->vqs[idx]->mutex);
+}
+
+/*
+ * Add a mapping node to @umem.
+ *
+ * @start/@end are stored as the node's start/last, @size and
+ * @userspace_addr describe the backing range, @perm is stored as the
+ * node's permission mask.  When @umem already holds max_iotlb_entries
+ * nodes, the first node on umem_list is freed before inserting; since new
+ * nodes are appended at the tail, that is the earliest inserted one.
+ *
+ * Returns 0 on success or -ENOMEM if the node cannot be allocated.
+ */
+static int vhost_new_umem_range(struct vhost_umem *umem,
+ u64 start, u64 size, u64 end,
+ u64 userspace_addr, int perm)
+{
+ /* Allocated before any eviction so failure leaves @umem untouched. */
+ struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC);
+
+ if (!node)
+ return -ENOMEM;
+
+ if (umem->numem == max_iotlb_entries) {
+ tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);
+ vhost_umem_free(umem, tmp);
+ }
+
+ node->start = start;
+ node->size = size;
+ node->last = end;
+ node->userspace_addr = userspace_addr;
+ node->perm = perm;
+ INIT_LIST_HEAD(&node->link);
+ /* Track the node in both the insertion-ordered list and the tree. */
+ list_add_tail(&node->link, &umem->umem_list);
+ vhost_umem_interval_tree_insert(node, &umem->umem_tree);
+ umem->numem++;
+
+ return 0;
+}
+
+/*
+ * Remove and free every node of @umem that the interval tree reports for
+ * the range [@start, @end].
+ */
+static void vhost_del_umem_range(struct vhost_umem *umem,
+				 u64 start, u64 end)
+{
+	for (;;) {
+		struct vhost_umem_node *hit =
+			vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+							    start, end);
+
+		if (!hit)
+			break;
+		vhost_umem_free(umem, hit);
+	}
+}
+
+/*
+ * After an IOTLB update described by @msg, wake the virtqueues whose
+ * pending miss is now covered.
+ *
+ * Walks d->pending_list under d->iotlb_lock.  For each pending
+ * VHOST_IOTLB_MISS whose iova lies inside [msg->iova,
+ * msg->iova + msg->size - 1], the owning vq's poll work is queued and the
+ * pending node is unlinked and freed.
+ */
+static void vhost_iotlb_notify_vq(struct vhost_dev *d,
+ struct vhost_iotlb_msg *msg)
+{
+ struct vhost_msg_node *node, *n;
+
+ spin_lock(&d->iotlb_lock);
+
+ /* _safe variant: matching nodes are deleted while iterating. */
+ list_for_each_entry_safe(node, n, &d->pending_list, node) {
+ struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
+ if (msg->iova <= vq_msg->iova &&
+ msg->iova + msg->size - 1 >= vq_msg->iova &&
+ vq_msg->type == VHOST_IOTLB_MISS) {
+ vhost_poll_queue(&node->vq->poll);
+ list_del(&node->node);
+ kfree(node);
+ }
+ }
+
+ spin_unlock(&d->iotlb_lock);
+}
+
+/*
+ * Validate the userspace range [@uaddr, @uaddr + @size) for the access
+ * bits in @access (VHOST_ACCESS_RO and/or VHOST_ACCESS_WO).
+ *
+ * Returns 0 when the range is acceptable and -EFAULT when the range
+ * overflows an unsigned long or fails access_ok() for a requested mode.
+ */
+static int umem_access_ok(u64 uaddr, u64 size, int access)
+{
+	void __user *uptr = (void __user *)(unsigned long)uaddr;
+
+	/* Make sure 64 bit math will not overflow. */
+	if (vhost_overflow(uaddr, size))
+		return -EFAULT;
+
+	if ((access & VHOST_ACCESS_RO) && !access_ok(VERIFY_READ, uptr, size))
+		return -EFAULT;
+
+	if ((access & VHOST_ACCESS_WO) && !access_ok(VERIFY_WRITE, uptr, size))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Apply one IOTLB message from userspace to @dev.
+ *
+ * VHOST_IOTLB_UPDATE validates the userspace range, inserts the mapping
+ * into dev->iotlb and wakes virtqueues whose pending miss it covers.
+ * VHOST_IOTLB_INVALIDATE removes every mapping overlapping
+ * [iova, iova + size - 1].
+ *
+ * Runs with dev->mutex and all virtqueue mutexes held.
+ *
+ * Returns 0 on success, -EFAULT when the device IOTLB has not been set up
+ * or the userspace range is not accessible, -ENOMEM when the mapping
+ * cannot be inserted, and -EINVAL for an unknown message type.
+ *
+ * The INVALIDATE path now rejects a NULL dev->iotlb like UPDATE does;
+ * before, it passed the NULL pointer on to vhost_del_umem_range().
+ */
+int vhost_process_iotlb_msg(struct vhost_dev *dev,
+			    struct vhost_iotlb_msg *msg)
+{
+	int ret = 0;
+
+	mutex_lock(&dev->mutex);
+	vhost_dev_lock_vqs(dev);
+	switch (msg->type) {
+	case VHOST_IOTLB_UPDATE:
+		if (!dev->iotlb) {
+			ret = -EFAULT;
+			break;
+		}
+		if (umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
+			ret = -EFAULT;
+			break;
+		}
+		if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,
+					 msg->iova + msg->size - 1,
+					 msg->uaddr, msg->perm)) {
+			ret = -ENOMEM;
+			break;
+		}
+		vhost_iotlb_notify_vq(dev, msg);
+		break;
+	case VHOST_IOTLB_INVALIDATE:
+		/* Nothing to invalidate if the IOTLB was never enabled. */
+		if (!dev->iotlb) {
+			ret = -EFAULT;
+			break;
+		}
+		vhost_del_umem_range(dev->iotlb, msg->iova,
+				     msg->iova + msg->size - 1);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	vhost_dev_unlock_vqs(dev);
+	mutex_unlock(&dev->mutex);
+
+	return ret;
+}
+/*
+ * Handle a write() on the vhost character device: read one struct
+ * vhost_msg from @from and dispatch it.
+ *
+ * Returns 0 when @from holds less than one message, sizeof(struct
+ * vhost_msg) when the message was processed, -EFAULT when the message
+ * could not be copied in completely, -EINVAL for an unknown message type,
+ * or the error from vhost_process_iotlb_msg().
+ *
+ * The result is kept in an ssize_t (it carries negative errnos), and a
+ * short copy is reported as an error instead of a positive byte count for
+ * a message that was never processed.
+ */
+ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
+			     struct iov_iter *from)
+{
+	struct vhost_msg_node node;
+	unsigned size = sizeof(struct vhost_msg);
+	ssize_t ret;
+	int err;
+
+	if (iov_iter_count(from) < size)
+		return 0;
+
+	if (copy_from_iter(&node.msg, size, from) != size)
+		return -EFAULT;
+	ret = size;
+
+	switch (node.msg.type) {
+	case VHOST_IOTLB_MSG:
+		err = vhost_process_iotlb_msg(dev, &node.msg.iotlb);
+		if (err)
+			ret = err;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(vhost_chr_write_iter);
+
+/*
+ * poll() support for the vhost character device: register on dev->wait
+ * and report POLLIN | POLLRDNORM when dev->read_list has messages queued,
+ * otherwise 0.
+ */
+unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev,
+			    poll_table *wait)
+{
+	poll_wait(file, &dev->wait, wait);
+
+	return list_empty(&dev->read_list) ? 0 : (POLLIN | POLLRDNORM);
+}
+EXPORT_SYMBOL(vhost_chr_poll);
+
+/*
+ * Handle a read() on the vhost character device: hand one queued message
+ * from dev->read_list to userspace.
+ *
+ * @noblock: non-zero to return -EAGAIN instead of sleeping when no message
+ *           is queued.
+ *
+ * Returns 0 when @to cannot hold a whole struct vhost_msg, the number of
+ * bytes copied when a message was dequeued, -EAGAIN (non-blocking, nothing
+ * queued), -ERESTARTSYS (signal pending) or -EBADFD (no device IOTLB).
+ * A fully copied VHOST_IOTLB_MISS message is moved to dev->pending_list;
+ * any other dequeued message is freed.
+ */
+ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
+ int noblock)
+{
+ DEFINE_WAIT(wait);
+ struct vhost_msg_node *node;
+ ssize_t ret = 0;
+ unsigned size = sizeof(struct vhost_msg);
+
+ if (iov_iter_count(to) < size)
+ return 0;
+
+ while (1) {
+ /* Register as a waiter before checking the list, so a wakeup
+ * between the check and schedule() is not lost.
+ */
+ if (!noblock)
+ prepare_to_wait(&dev->wait, &wait,
+ TASK_INTERRUPTIBLE);
+
+ node = vhost_dequeue_msg(dev, &dev->read_list);
+ if (node)
+ break;
+ if (noblock) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (!dev->iotlb) {
+ ret = -EBADFD;
+ break;
+ }
+
+ schedule();
+ }
+
+ if (!noblock)
+ finish_wait(&dev->wait, &wait);
+
+ if (node) {
+ ret = copy_to_iter(&node->msg, size, to);
+
+ /* Short copy or non-miss message: the node is not kept. */
+ if (ret != size || node->msg.type != VHOST_IOTLB_MISS) {
+ kfree(node);
+ return ret;
+ }
+
+ /* Delivered miss: park it until an update covers it. */
+ vhost_enqueue_msg(dev, &dev->pending_list, node);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
+
+/*
+ * Queue a VHOST_IOTLB_MISS message for @iova with the requested @access
+ * on the read list of @vq's device, where vhost_chr_read_iter() picks it
+ * up.
+ *
+ * Returns 0 on success or -ENOMEM when no message node can be allocated.
+ */
+static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
+{
+	struct vhost_msg_node *miss = vhost_new_msg(vq, VHOST_IOTLB_MISS);
+
+	if (!miss)
+		return -ENOMEM;
+
+	miss->msg.iotlb.type = VHOST_IOTLB_MISS;
+	miss->msg.iotlb.iova = iova;
+	miss->msg.iotlb.perm = access;
+
+	vhost_enqueue_msg(vq->dev, &vq->dev->read_list, miss);
+
+	return 0;
+}
+
static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
struct vring_desc __user *desc,
struct vring_avail __user *avail,
struct vring_used __user *used)
+
{
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+
return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
access_ok(VERIFY_READ, avail,
sizeof *avail + num * sizeof *avail->ring + s) &&
@@ -638,11 +1105,59 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
sizeof *used + num * sizeof *used->ring + s);
}
+/*
+ * Check that [@addr, @addr + @len) is covered by IOTLB entries of @vq
+ * that permit @access.
+ *
+ * Returns true when the whole range is covered, false otherwise.  When
+ * coverage is missing at the current address a VHOST_IOTLB_MISS is queued
+ * via vhost_iotlb_miss(); when an entry exists but lacks the permission,
+ * false is returned without queueing anything.
+ */
+static int iotlb_access_ok(struct vhost_virtqueue *vq,
+ int access, u64 addr, u64 len)
+{
+ const struct vhost_umem_node *node;
+ struct vhost_umem *umem = vq->iotlb;
+ u64 s = 0, size;
+
+ /* s counts the bytes validated so far; addr advances with it. */
+ while (len > s) {
+ /* NOTE(review): the query end uses the advanced addr with the
+ * original len, so it can reach past the requested range on
+ * later iterations - confirm this is intended.
+ */
+ node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+ addr,
+ addr + len - 1);
+ if (node == NULL || node->start > addr) {
+ vhost_iotlb_miss(vq, addr, access);
+ return false;
+ } else if (!(node->perm & access)) {
+ /* Report the possible access violation by
+ * request another translation from userspace.
+ */
+ return false;
+ }
+
+ /* Bytes from addr to the end of this node. */
+ size = node->size - addr + node->start;
+ s += size;
+ addr += size;
+ }
+
+ return true;
+}
+
+/*
+ * Verify that the descriptor table, avail ring and used ring of @vq are
+ * all reachable through the device IOTLB.
+ *
+ * Returns 1 when no IOTLB is in use or all three areas pass
+ * iotlb_access_ok(), 0 otherwise.  The areas are checked in the order
+ * desc (read), avail (read), used (write), stopping at the first failure.
+ */
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+	/* Two extra bytes per ring when the event index feature is acked. */
+	size_t event_sz = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+	unsigned int num = vq->num;
+
+	if (!vq->iotlb)
+		return 1;
+
+	if (!iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+			     num * sizeof *vq->desc))
+		return 0;
+
+	if (!iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
+			     sizeof *vq->avail +
+			     num * sizeof *vq->avail->ring + event_sz))
+		return 0;
+
+	if (!iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
+			     sizeof *vq->used +
+			     num * sizeof *vq->used->ring + event_sz))
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
+
/* Can we log writes? */
/* Caller should have device mutex but not vq mutex */
int vhost_log_access_ok(struct vhost_dev *dev)
{
- return memory_access_ok(dev, dev->memory, 1);
+ return memory_access_ok(dev, dev->umem, 1);
}
EXPORT_SYMBOL_GPL(vhost_log_access_ok);
@@ -653,7 +1168,7 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
{
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
- return vq_memory_access_ok(log_base, vq->memory,
+ return vq_memory_access_ok(log_base, vq->umem,
vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
(!vq->log_used || log_access_ok(log_base, vq->log_addr,
sizeof *vq->used +
@@ -664,33 +1179,36 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
/* Caller should have vq mutex and device mutex */
int vhost_vq_access_ok(struct vhost_virtqueue *vq)
{
- return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used) &&
- vq_log_access_ok(vq, vq->log_base);
-}
-EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
+ if (!vq_log_access_ok(vq, vq->log_base))
+ return 0;
-static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2)
-{
- const struct vhost_memory_region *r1 = p1, *r2 = p2;
- if (r1->guest_phys_addr < r2->guest_phys_addr)
+ /* Access validation occurs at prefetch time with IOTLB */
+ if (vq->iotlb)
return 1;
- if (r1->guest_phys_addr > r2->guest_phys_addr)
- return -1;
- return 0;
+
+ return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
}
+EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
-static void *vhost_kvzalloc(unsigned long size)
+static struct vhost_umem *vhost_umem_alloc(void)
{
- void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ struct vhost_umem *umem = vhost_kvzalloc(sizeof(*umem));
- if (!n)
- n = vzalloc(size);
- return n;
+ if (!umem)
+ return NULL;
+
+ umem->umem_tree = RB_ROOT;
+ umem->numem = 0;
+ INIT_LIST_HEAD(&umem->umem_list);
+
+ return umem;
}
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
- struct vhost_memory mem, *newmem, *oldmem;
+ struct vhost_memory mem, *newmem;
+ struct vhost_memory_region *region;
+ struct vhost_umem *newumem, *oldumem;
unsigned long size = offsetof(struct vhost_memory, regions);
int i;
@@ -710,24 +1228,47 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
kvfree(newmem);
return -EFAULT;
}
- sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions),
- vhost_memory_reg_sort_cmp, NULL);
- if (!memory_access_ok(d, newmem, 0)) {
+ newumem = vhost_umem_alloc();
+ if (!newumem) {
kvfree(newmem);
- return -EFAULT;
+ return -ENOMEM;
+ }
+
+ for (region = newmem->regions;
+ region < newmem->regions + mem.nregions;
+ region++) {
+ if (vhost_new_umem_range(newumem,
+ region->guest_phys_addr,
+ region->memory_size,
+ region->guest_phys_addr +
+ region->memory_size - 1,
+ region->userspace_addr,
+ VHOST_ACCESS_RW))
+ goto err;
}
- oldmem = d->memory;
- d->memory = newmem;
+
+ if (!memory_access_ok(d, newumem, 0))
+ goto err;
+
+ oldumem = d->umem;
+ d->umem = newumem;
/* All memory accesses are done under some VQ mutex. */
for (i = 0; i < d->nvqs; ++i) {
mutex_lock(&d->vqs[i]->mutex);
- d->vqs[i]->memory = newmem;
+ d->vqs[i]->umem = newumem;
mutex_unlock(&d->vqs[i]->mutex);
}
- kvfree(oldmem);
+
+ kvfree(newmem);
+ vhost_umem_clean(oldumem);
return 0;
+
+err:
+ vhost_umem_clean(newumem);
+ kvfree(newmem);
+ return -EFAULT;
}
long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
@@ -913,6 +1454,19 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
case VHOST_GET_VRING_ENDIAN:
r = vhost_get_vring_endian(vq, idx, argp);
break;
+ case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
+ if (copy_from_user(&s, argp, sizeof(s))) {
+ r = -EFAULT;
+ break;
+ }
+ vq->busyloop_timeout = s.num;
+ break;
+ case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
+ s.index = idx;
+ s.num = vq->busyloop_timeout;
+ if (copy_to_user(argp, &s, sizeof(s)))
+ r = -EFAULT;
+ break;
default:
r = -ENOIOCTLCMD;
}
@@ -936,6 +1490,30 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
}
EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
+int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
+{
+ struct vhost_umem *niotlb, *oiotlb;
+ int i;
+
+ niotlb = vhost_umem_alloc();
+ if (!niotlb)
+ return -ENOMEM;
+
+ oiotlb = d->iotlb;
+ d->iotlb = niotlb;
+
+ for (i = 0; i < d->nvqs; ++i) {
+ mutex_lock(&d->vqs[i]->mutex);
+ d->vqs[i]->iotlb = niotlb;
+ mutex_unlock(&d->vqs[i]->mutex);
+ }
+
+ vhost_umem_clean(oiotlb);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
+
/* Caller must have device mutex */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
@@ -1018,28 +1596,6 @@ done:
}
EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
-static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
- __u64 addr, __u32 len)
-{
- const struct vhost_memory_region *reg;
- int start = 0, end = mem->nregions;
-
- while (start < end) {
- int slot = start + (end - start) / 2;
- reg = mem->regions + slot;
- if (addr >= reg->guest_phys_addr)
- end = slot;
- else
- start = slot + 1;
- }
-
- reg = mem->regions + start;
- if (addr >= reg->guest_phys_addr &&
- reg->guest_phys_addr + reg->memory_size > addr)
- return reg;
- return NULL;
-}
-
/* TODO: This is really inefficient. We need something like get_user()
* (instruction directly accesses the data, with an exception table entry
* returning -EFAULT). See Documentation/x86/exception-tables.txt.
@@ -1118,7 +1674,8 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
void __user *used;
- if (__put_user(cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags) < 0)
+ if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
+ &vq->used->flags) < 0)
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1136,7 +1693,8 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
{
- if (__put_user(cpu_to_vhost16(vq, vq->avail_idx), vhost_avail_event(vq)))
+ if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
+ vhost_avail_event(vq)))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -1153,62 +1711,84 @@ static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
return 0;
}
-int vhost_init_used(struct vhost_virtqueue *vq)
+int vhost_vq_init_access(struct vhost_virtqueue *vq)
{
__virtio16 last_used_idx;
int r;
- if (!vq->private_data) {
- vq->is_le = virtio_legacy_is_little_endian();
+ bool is_le = vq->is_le;
+
+ if (!vq->private_data)
return 0;
- }
vhost_init_is_le(vq);
r = vhost_update_used_flags(vq);
if (r)
- return r;
+ goto err;
vq->signalled_used_valid = false;
- if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx))
- return -EFAULT;
- r = __get_user(last_used_idx, &vq->used->idx);
- if (r)
- return r;
+ if (!vq->iotlb &&
+ !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) {
+ r = -EFAULT;
+ goto err;
+ }
+ r = vhost_get_user(vq, last_used_idx, &vq->used->idx);
+ if (r) {
+ vq_err(vq, "Can't access used idx at %p\n",
+ &vq->used->idx);
+ goto err;
+ }
vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
return 0;
+
+err:
+ vq->is_le = is_le;
+ return r;
}
-EXPORT_SYMBOL_GPL(vhost_init_used);
+EXPORT_SYMBOL_GPL(vhost_vq_init_access);
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
- struct iovec iov[], int iov_size)
+ struct iovec iov[], int iov_size, int access)
{
- const struct vhost_memory_region *reg;
- struct vhost_memory *mem;
+ const struct vhost_umem_node *node;
+ struct vhost_dev *dev = vq->dev;
+ struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
struct iovec *_iov;
u64 s = 0;
int ret = 0;
- mem = vq->memory;
while ((u64)len > s) {
u64 size;
if (unlikely(ret >= iov_size)) {
ret = -ENOBUFS;
break;
}
- reg = find_region(mem, addr, len);
- if (unlikely(!reg)) {
- ret = -EFAULT;
+
+ node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+ addr, addr + len - 1);
+ if (node == NULL || node->start > addr) {
+ if (umem != dev->iotlb) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = -EAGAIN;
+ break;
+ } else if (!(node->perm & access)) {
+ ret = -EPERM;
break;
}
+
_iov = iov + ret;
- size = reg->memory_size - addr + reg->guest_phys_addr;
+ size = node->size - addr + node->start;
_iov->iov_len = min((u64)len - s, size);
_iov->iov_base = (void __user *)(unsigned long)
- (reg->userspace_addr + addr - reg->guest_phys_addr);
+ (node->userspace_addr + addr - node->start);
s += size;
addr += size;
++ret;
}
+ if (ret == -EAGAIN)
+ vhost_iotlb_miss(vq, addr, access);
return ret;
}
@@ -1243,7 +1823,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
unsigned int i = 0, count, found = 0;
u32 len = vhost32_to_cpu(vq, indirect->len);
struct iov_iter from;
- int ret;
+ int ret, access;
/* Sanity check */
if (unlikely(len % sizeof desc)) {
@@ -1255,9 +1835,10 @@ static int get_indirect(struct vhost_virtqueue *vq,
}
ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
- UIO_MAXIOV);
+ UIO_MAXIOV, VHOST_ACCESS_RO);
if (unlikely(ret < 0)) {
- vq_err(vq, "Translation failure %d in indirect.\n", ret);
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d in indirect.\n", ret);
return ret;
}
iov_iter_init(&from, READ, vq->indirect, ret, len);
@@ -1295,16 +1876,22 @@ static int get_indirect(struct vhost_virtqueue *vq,
return -EINVAL;
}
+ if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+ access = VHOST_ACCESS_WO;
+ else
+ access = VHOST_ACCESS_RO;
+
ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
vhost32_to_cpu(vq, desc.len), iov + iov_count,
- iov_size - iov_count);
+ iov_size - iov_count, access);
if (unlikely(ret < 0)) {
- vq_err(vq, "Translation failure %d indirect idx %d\n",
- ret, i);
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d indirect idx %d\n",
+ ret, i);
return ret;
}
/* If this is an input descriptor, increment that count. */
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
+ if (access == VHOST_ACCESS_WO) {
*in_num += ret;
if (unlikely(log)) {
log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
@@ -1343,11 +1930,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
u16 last_avail_idx;
__virtio16 avail_idx;
__virtio16 ring_head;
- int ret;
+ int ret, access;
/* Check it isn't doing very strange things with descriptor numbers. */
last_avail_idx = vq->last_avail_idx;
- if (unlikely(__get_user(avail_idx, &vq->avail->idx))) {
+ if (unlikely(vhost_get_user(vq, avail_idx, &vq->avail->idx))) {
vq_err(vq, "Failed to access avail idx at %p\n",
&vq->avail->idx);
return -EFAULT;
@@ -1369,8 +1956,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
/* Grab the next descriptor number they're advertising, and increment
* the index we've seen. */
- if (unlikely(__get_user(ring_head,
- &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {
+ if (unlikely(vhost_get_user(vq, ring_head,
+ &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {
vq_err(vq, "Failed to read head: idx %d address %p\n",
last_avail_idx,
&vq->avail->ring[last_avail_idx % vq->num]);
@@ -1405,7 +1992,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
i, vq->num, head);
return -EINVAL;
}
- ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
+ ret = vhost_copy_from_user(vq, &desc, vq->desc + i,
+ sizeof desc);
if (unlikely(ret)) {
vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
i, vq->desc + i);
@@ -1416,22 +2004,28 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
out_num, in_num,
log, log_num, &desc);
if (unlikely(ret < 0)) {
- vq_err(vq, "Failure detected "
- "in indirect descriptor at idx %d\n", i);
+ if (ret != -EAGAIN)
+ vq_err(vq, "Failure detected "
+ "in indirect descriptor at idx %d\n", i);
return ret;
}
continue;
}
+ if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+ access = VHOST_ACCESS_WO;
+ else
+ access = VHOST_ACCESS_RO;
ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
vhost32_to_cpu(vq, desc.len), iov + iov_count,
- iov_size - iov_count);
+ iov_size - iov_count, access);
if (unlikely(ret < 0)) {
- vq_err(vq, "Translation failure %d descriptor idx %d\n",
- ret, i);
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d descriptor idx %d\n",
+ ret, i);
return ret;
}
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
+ if (access == VHOST_ACCESS_WO) {
/* If this is an input descriptor,
* increment that count. */
*in_num += ret;
@@ -1493,15 +2087,15 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
start = vq->last_used_idx & (vq->num - 1);
used = vq->used->ring + start;
if (count == 1) {
- if (__put_user(heads[0].id, &used->id)) {
+ if (vhost_put_user(vq, heads[0].id, &used->id)) {
vq_err(vq, "Failed to write used id");
return -EFAULT;
}
- if (__put_user(heads[0].len, &used->len)) {
+ if (vhost_put_user(vq, heads[0].len, &used->len)) {
vq_err(vq, "Failed to write used len");
return -EFAULT;
}
- } else if (__copy_to_user(used, heads, count * sizeof *used)) {
+ } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
vq_err(vq, "Failed to write used");
return -EFAULT;
}
@@ -1545,7 +2139,8 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
/* Make sure buffer is written before we update index. */
smp_wmb();
- if (__put_user(cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx)) {
+ if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
+ &vq->used->idx)) {
vq_err(vq, "Failed to increment used idx");
return -EFAULT;
}
@@ -1579,7 +2174,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
__virtio16 flags;
- if (__get_user(flags, &vq->avail->flags)) {
+ if (vhost_get_user(vq, flags, &vq->avail->flags)) {
vq_err(vq, "Failed to get flags");
return true;
}
@@ -1593,7 +2188,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
if (unlikely(!v))
return true;
- if (__get_user(event, vhost_used_event(vq))) {
+ if (vhost_get_user(vq, event, vhost_used_event(vq))) {
vq_err(vq, "Failed to get used event idx");
return true;
}
@@ -1629,6 +2224,20 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
+/* return true if we're sure that the available ring is empty */
+bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ __virtio16 avail_idx;
+ int r;
+
+ r = vhost_get_user(vq, avail_idx, &vq->avail->idx);
+ if (r)
+ return false;
+
+ return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
+}
+EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
+
/* OK, now we need to know about added descriptors. */
bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
@@ -1656,7 +2265,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
/* They could have slipped one in as we were doing that: make
* sure it's written, then check again. */
smp_mb();
- r = __get_user(avail_idx, &vq->avail->idx);
+ r = vhost_get_user(vq, avail_idx, &vq->avail->idx);
if (r) {
vq_err(vq, "Failed to check avail idx at %p: %d\n",
&vq->avail->idx, r);
@@ -1684,6 +2293,50 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
}
EXPORT_SYMBOL_GPL(vhost_disable_notify);
+/* Create a new message. */
+struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
+{
+ struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
+ if (!node)
+ return NULL;
+
+ /* Make sure all padding within the structure is initialized. */
+ memset(&node->msg, 0, sizeof node->msg);
+ node->vq = vq;
+ node->msg.type = type;
+ return node;
+}
+EXPORT_SYMBOL_GPL(vhost_new_msg);
+
+void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
+ struct vhost_msg_node *node)
+{
+ spin_lock(&dev->iotlb_lock);
+ list_add_tail(&node->node, head);
+ spin_unlock(&dev->iotlb_lock);
+
+ wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);
+}
+EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
+
+struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
+ struct list_head *head)
+{
+ struct vhost_msg_node *node = NULL;
+
+ spin_lock(&dev->iotlb_lock);
+ if (!list_empty(head)) {
+ node = list_first_entry(head, struct vhost_msg_node,
+ node);
+ list_del(&node->node);
+ }
+ spin_unlock(&dev->iotlb_lock);
+
+ return node;
+}
+EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
+
+
static int __init vhost_init(void)
{
return 0;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d3f767448a72..78f3c5fc02e4 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -15,13 +15,15 @@
struct vhost_work;
typedef void (*vhost_work_fn_t)(struct vhost_work *work);
+#define VHOST_WORK_QUEUED 1
struct vhost_work {
- struct list_head node;
+ struct llist_node node;
vhost_work_fn_t fn;
wait_queue_head_t done;
int flushing;
unsigned queue_seq;
unsigned done_seq;
+ unsigned long flags;
};
/* Poll a file (eventfd or socket) */
@@ -37,6 +39,7 @@ struct vhost_poll {
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
+bool vhost_has_work(struct vhost_dev *dev);
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
unsigned long mask, struct vhost_dev *dev);
@@ -52,6 +55,27 @@ struct vhost_log {
u64 len;
};
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+
+struct vhost_umem_node {
+ struct rb_node rb;
+ struct list_head link;
+ __u64 start;
+ __u64 last;
+ __u64 size;
+ __u64 userspace_addr;
+ __u32 perm;
+ __u32 flags_padding;
+ __u64 __subtree_last;
+};
+
+struct vhost_umem {
+ struct rb_root umem_tree;
+ struct list_head umem_list;
+ int numem;
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -97,10 +121,12 @@ struct vhost_virtqueue {
u64 log_addr;
struct iovec iov[UIO_MAXIOV];
+ struct iovec iotlb_iov[64];
struct iovec *indirect;
struct vring_used_elem *heads;
/* Protected by virtqueue mutex. */
- struct vhost_memory *memory;
+ struct vhost_umem *umem;
+ struct vhost_umem *iotlb;
void *private_data;
u64 acked_features;
/* Log write descriptors */
@@ -114,27 +140,38 @@ struct vhost_virtqueue {
/* Ring endianness requested by userspace for cross-endian support. */
bool user_be;
#endif
+ u32 busyloop_timeout;
+};
+
+struct vhost_msg_node {
+ struct vhost_msg msg;
+ struct vhost_virtqueue *vq;
+ struct list_head node;
};
struct vhost_dev {
- struct vhost_memory *memory;
struct mm_struct *mm;
struct mutex mutex;
struct vhost_virtqueue **vqs;
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
- spinlock_t work_lock;
- struct list_head work_list;
+ struct llist_head work_list;
struct task_struct *worker;
+ struct vhost_umem *umem;
+ struct vhost_umem *iotlb;
+ spinlock_t iotlb_lock;
+ struct list_head read_list;
+ struct list_head pending_list;
+ wait_queue_head_t wait;
};
void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
long vhost_dev_set_owner(struct vhost_dev *dev);
bool vhost_dev_has_owner(struct vhost_dev *dev);
long vhost_dev_check_owner(struct vhost_dev *);
-struct vhost_memory *vhost_dev_reset_owner_prepare(void);
-void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_memory *);
+struct vhost_umem *vhost_dev_reset_owner_prepare(void);
+void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
void vhost_dev_cleanup(struct vhost_dev *, bool locked);
void vhost_dev_stop(struct vhost_dev *);
long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
@@ -148,7 +185,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
struct vhost_log *log, unsigned int *log_num);
void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
-int vhost_init_used(struct vhost_virtqueue *);
+int vhost_vq_init_access(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
unsigned count);
@@ -158,10 +195,26 @@ void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
struct vring_used_elem *heads, unsigned count);
void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
+bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq);
+
+struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type);
+void vhost_enqueue_msg(struct vhost_dev *dev,
+ struct list_head *head,
+ struct vhost_msg_node *node);
+struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
+ struct list_head *head);
+unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev,
+ poll_table *wait);
+ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
+ int noblock);
+ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
+ struct iov_iter *from);
+int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
new file mode 100644
index 000000000000..72e914de473e
--- /dev/null
+++ b/drivers/vhost/vsock.c
@@ -0,0 +1,797 @@
+/*
+ * vhost transport for vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/miscdevice.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <net/sock.h>
+#include <linux/virtio_vsock.h>
+#include <linux/vhost.h>
+#include <linux/hashtable.h>
+
+#include <net/af_vsock.h>
+#include "vhost.h"
+
+#define VHOST_VSOCK_DEFAULT_HOST_CID 2
+
+enum {
+ VHOST_VSOCK_FEATURES = VHOST_FEATURES,
+};
+
+/* Used to track all the vhost_vsock instances on the system. */
+static DEFINE_SPINLOCK(vhost_vsock_lock);
+static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
+
+struct vhost_vsock {
+ struct vhost_dev dev;
+ struct vhost_virtqueue vqs[2];
+
+ /* Link to global vhost_vsock_hash, writes use vhost_vsock_lock */
+ struct hlist_node hash;
+
+ struct vhost_work send_pkt_work;
+ spinlock_t send_pkt_list_lock;
+ struct list_head send_pkt_list; /* host->guest pending packets */
+
+ atomic_t queued_replies;
+
+ u32 guest_cid;
+};
+
+static u32 vhost_transport_get_local_cid(void)
+{
+ return VHOST_VSOCK_DEFAULT_HOST_CID;
+}
+
+/* Callers that dereference the return value must hold vhost_vsock_lock or the
+ * RCU read lock.
+ */
+static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
+{
+ struct vhost_vsock *vsock;
+
+ hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) {
+ u32 other_cid = vsock->guest_cid;
+
+ /* Skip instances that have no CID yet */
+ if (other_cid == 0)
+ continue;
+
+ if (other_cid == guest_cid) {
+ return vsock;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
+ struct vhost_virtqueue *vq)
+{
+ struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+ bool added = false;
+ bool restart_tx = false;
+
+ mutex_lock(&vq->mutex);
+
+ if (!vq->private_data)
+ goto out;
+
+ /* Avoid further vmexits, we're already processing the virtqueue */
+ vhost_disable_notify(&vsock->dev, vq);
+
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ struct iov_iter iov_iter;
+ unsigned out, in;
+ size_t nbytes;
+ size_t len;
+ int head;
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ if (list_empty(&vsock->send_pkt_list)) {
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ vhost_enable_notify(&vsock->dev, vq);
+ break;
+ }
+
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (head < 0) {
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ break;
+ }
+
+ if (head == vq->num) {
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ /* We cannot finish yet if more buffers snuck in while
+ * re-enabling notify.
+ */
+ if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
+ vhost_disable_notify(&vsock->dev, vq);
+ continue;
+ }
+ break;
+ }
+
+ if (out) {
+ virtio_transport_free_pkt(pkt);
+ vq_err(vq, "Expected 0 output buffers, got %u\n", out);
+ break;
+ }
+
+ len = iov_length(&vq->iov[out], in);
+ iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
+
+ nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
+ if (nbytes != sizeof(pkt->hdr)) {
+ virtio_transport_free_pkt(pkt);
+ vq_err(vq, "Faulted on copying pkt hdr\n");
+ break;
+ }
+
+ nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
+ if (nbytes != pkt->len) {
+ virtio_transport_free_pkt(pkt);
+ vq_err(vq, "Faulted on copying pkt buf\n");
+ break;
+ }
+
+ vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+ added = true;
+
+ if (pkt->reply) {
+ int val;
+
+ val = atomic_dec_return(&vsock->queued_replies);
+
+ /* Do we have resources to resume tx processing? */
+ if (val + 1 == tx_vq->num)
+ restart_tx = true;
+ }
+
+ virtio_transport_free_pkt(pkt);
+ }
+ if (added)
+ vhost_signal(&vsock->dev, vq);
+
+out:
+ mutex_unlock(&vq->mutex);
+
+ if (restart_tx)
+ vhost_poll_queue(&tx_vq->poll);
+}
+
+static void vhost_transport_send_pkt_work(struct vhost_work *work)
+{
+ struct vhost_virtqueue *vq;
+ struct vhost_vsock *vsock;
+
+ vsock = container_of(work, struct vhost_vsock, send_pkt_work);
+ vq = &vsock->vqs[VSOCK_VQ_RX];
+
+ vhost_transport_do_send_pkt(vsock, vq);
+}
+
+static int
+vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct vhost_vsock *vsock;
+ struct vhost_virtqueue *vq;
+ int len = pkt->len;
+
+ rcu_read_lock();
+
+ /* Find the vhost_vsock according to guest context id */
+ vsock = vhost_vsock_get(le64_to_cpu(pkt->hdr.dst_cid));
+ if (!vsock) {
+ rcu_read_unlock();
+ virtio_transport_free_pkt(pkt);
+ return -ENODEV;
+ }
+
+ vq = &vsock->vqs[VSOCK_VQ_RX];
+
+ if (pkt->reply)
+ atomic_inc(&vsock->queued_replies);
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add_tail(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
+
+ rcu_read_unlock();
+ return len;
+}
+
+static int
+vhost_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+ struct vhost_vsock *vsock;
+ struct virtio_vsock_pkt *pkt, *n;
+ int cnt = 0;
+ int ret = -ENODEV;
+ LIST_HEAD(freeme);
+
+ rcu_read_lock();
+
+ /* Find the vhost_vsock according to guest context id */
+ vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
+ if (!vsock)
+ goto out;
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+ if (pkt->vsk != vsk)
+ continue;
+ list_move(&pkt->list, &freeme);
+ }
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ list_for_each_entry_safe(pkt, n, &freeme, list) {
+ if (pkt->reply)
+ cnt++;
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+
+ if (cnt) {
+ struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+ int new_cnt;
+
+ new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
+ if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
+ vhost_poll_queue(&tx_vq->poll);
+ }
+
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static struct virtio_vsock_pkt *
+vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
+ unsigned int out, unsigned int in)
+{
+ struct virtio_vsock_pkt *pkt;
+ struct iov_iter iov_iter;
+ size_t nbytes;
+ size_t len;
+
+ if (in != 0) {
+ vq_err(vq, "Expected 0 input buffers, got %u\n", in);
+ return NULL;
+ }
+
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ return NULL;
+
+ len = iov_length(vq->iov, out);
+ iov_iter_init(&iov_iter, WRITE, vq->iov, out, len);
+
+ nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
+ if (nbytes != sizeof(pkt->hdr)) {
+ vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
+ sizeof(pkt->hdr), nbytes);
+ kfree(pkt);
+ return NULL;
+ }
+
+ if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM)
+ pkt->len = le32_to_cpu(pkt->hdr.len);
+
+ /* No payload */
+ if (!pkt->len)
+ return pkt;
+
+ /* The pkt is too big */
+ if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) {
+ kfree(pkt);
+ return NULL;
+ }
+
+ pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
+ if (!pkt->buf) {
+ kfree(pkt);
+ return NULL;
+ }
+
+ nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
+ if (nbytes != pkt->len) {
+ vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
+ pkt->len, nbytes);
+ virtio_transport_free_pkt(pkt);
+ return NULL;
+ }
+
+ return pkt;
+}
+
+/* Is there space left for replies to rx packets? */
+static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
+{
+ struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX];
+ int val;
+
+ smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
+ val = atomic_read(&vsock->queued_replies);
+
+ return val < vq->num;
+}
+
+static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
+{
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
+ dev);
+ struct virtio_vsock_pkt *pkt;
+ int head;
+ unsigned int out, in;
+ bool added = false;
+
+ mutex_lock(&vq->mutex);
+
+ if (!vq->private_data)
+ goto out;
+
+ vhost_disable_notify(&vsock->dev, vq);
+ for (;;) {
+ u32 len;
+
+ if (!vhost_vsock_more_replies(vsock)) {
+ /* Stop tx until the device processes already
+ * pending replies. Leave tx virtqueue
+ * callbacks disabled.
+ */
+ goto no_more_replies;
+ }
+
+ head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (head < 0)
+ break;
+
+ if (head == vq->num) {
+ if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
+ vhost_disable_notify(&vsock->dev, vq);
+ continue;
+ }
+ break;
+ }
+
+ pkt = vhost_vsock_alloc_pkt(vq, out, in);
+ if (!pkt) {
+ vq_err(vq, "Faulted on pkt\n");
+ continue;
+ }
+
+ len = pkt->len;
+
+ /* Only accept correctly addressed packets */
+ if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid)
+ virtio_transport_recv_pkt(pkt);
+ else
+ virtio_transport_free_pkt(pkt);
+
+ vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
+ added = true;
+ }
+
+no_more_replies:
+ if (added)
+ vhost_signal(&vsock->dev, vq);
+
+out:
+ mutex_unlock(&vq->mutex);
+}
+
+static void vhost_vsock_handle_rx_kick(struct vhost_work *work)
+{
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
+ dev);
+
+ vhost_transport_do_send_pkt(vsock, vq);
+}
+
+static int vhost_vsock_start(struct vhost_vsock *vsock)
+{
+ struct vhost_virtqueue *vq;
+ size_t i;
+ int ret;
+
+ mutex_lock(&vsock->dev.mutex);
+
+ ret = vhost_dev_check_owner(&vsock->dev);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
+ vq = &vsock->vqs[i];
+
+ mutex_lock(&vq->mutex);
+
+ if (!vhost_vq_access_ok(vq)) {
+ ret = -EFAULT;
+ goto err_vq;
+ }
+
+ if (!vq->private_data) {
+ vq->private_data = vsock;
+ ret = vhost_vq_init_access(vq);
+ if (ret)
+ goto err_vq;
+ }
+
+ mutex_unlock(&vq->mutex);
+ }
+
+ mutex_unlock(&vsock->dev.mutex);
+ return 0;
+
+err_vq:
+ vq->private_data = NULL;
+ mutex_unlock(&vq->mutex);
+
+ for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
+ vq = &vsock->vqs[i];
+
+ mutex_lock(&vq->mutex);
+ vq->private_data = NULL;
+ mutex_unlock(&vq->mutex);
+ }
+err:
+ mutex_unlock(&vsock->dev.mutex);
+ return ret;
+}
+
+static int vhost_vsock_stop(struct vhost_vsock *vsock)
+{
+ size_t i;
+ int ret;
+
+ mutex_lock(&vsock->dev.mutex);
+
+ ret = vhost_dev_check_owner(&vsock->dev);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
+ struct vhost_virtqueue *vq = &vsock->vqs[i];
+
+ mutex_lock(&vq->mutex);
+ vq->private_data = NULL;
+ mutex_unlock(&vq->mutex);
+ }
+
+err:
+ mutex_unlock(&vsock->dev.mutex);
+ return ret;
+}
+
+static void vhost_vsock_free(struct vhost_vsock *vsock)
+{
+ kvfree(vsock);
+}
+
+static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
+{
+ struct vhost_virtqueue **vqs;
+ struct vhost_vsock *vsock;
+ int ret;
+
+ /* This struct is large and allocation could fail, fall back to vmalloc
+ * if there is no other way.
+ */
+ vsock = kzalloc(sizeof(*vsock), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!vsock) {
+ vsock = vmalloc(sizeof(*vsock));
+ if (!vsock)
+ return -ENOMEM;
+ }
+
+ vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL);
+ if (!vqs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ vsock->guest_cid = 0; /* no CID assigned yet */
+
+ atomic_set(&vsock->queued_replies, 0);
+
+ vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX];
+ vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX];
+ vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
+ vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;
+
+ vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs));
+
+ file->private_data = vsock;
+ spin_lock_init(&vsock->send_pkt_list_lock);
+ INIT_LIST_HEAD(&vsock->send_pkt_list);
+ vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work);
+ return 0;
+
+out:
+ vhost_vsock_free(vsock);
+ return ret;
+}
+
+/*
+ * Flush pending work: first the poll work of every virtqueue that has a
+ * kick handler installed, then the send-packet work item.
+ */
+static void vhost_vsock_flush(struct vhost_vsock *vsock)
+{
+	struct vhost_virtqueue *vq;
+	int n;
+
+	for (n = 0; n < ARRAY_SIZE(vsock->vqs); n++) {
+		vq = &vsock->vqs[n];
+		if (!vq->handle_kick)
+			continue;
+		vhost_poll_flush(&vq->poll);
+	}
+
+	vhost_work_flush(&vsock->dev, &vsock->send_pkt_work);
+}
+
+/* Per-socket callback handed to vsock_for_each_connected_socket() from
+ * vhost_vsock_dev_release().
+ *
+ * If the socket's remote CID no longer resolves through vhost_vsock_get()
+ * and no close timeout is pending, the socket is flagged SOCK_DONE, marked
+ * as fully shut down by the peer, moved to SS_UNCONNECTED, and ECONNRESET
+ * is reported through sk_error_report(). Otherwise it is left untouched.
+ */
+static void vhost_vsock_reset_orphans(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ /* vmci_transport.c doesn't take sk_lock here either. At least we're
+ * under vsock_table_lock so the sock cannot disappear while we're
+ * executing.
+ */
+
+ /* If the peer is still valid, no need to reset connection */
+ if (vhost_vsock_get(vsk->remote_addr.svm_cid))
+ return;
+
+ /* If the close timeout is pending, let it expire. This avoids races
+ * with the timeout callback.
+ */
+ if (vsk->close_work_scheduled)
+ return;
+
+ sock_set_flag(sk, SOCK_DONE);
+ vsk->peer_shutdown = SHUTDOWN_MASK;
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = ECONNRESET;
+ sk->sk_error_report(sk);
+}
+
+/* release() handler: tear down the vhost_vsock instance stored in
+ * file->private_data.
+ *
+ * Sequence, as coded below: remove the instance from the CID hash, wait for
+ * an RCU grace period, reset orphaned sockets, stop and flush the device,
+ * free every packet still on send_pkt_list, then clean up the vhost device
+ * and free the memory.
+ *
+ * Always returns 0.
+ */
+static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
+{
+ struct vhost_vsock *vsock = file->private_data;
+
+ /* Only an instance with a non-zero guest_cid was added to the hash
+ * (see vhost_vsock_set_cid()).
+ */
+ spin_lock_bh(&vhost_vsock_lock);
+ if (vsock->guest_cid)
+ hash_del_rcu(&vsock->hash);
+ spin_unlock_bh(&vhost_vsock_lock);
+
+ /* Wait for other CPUs to finish using vsock */
+ synchronize_rcu();
+
+ /* Iterating over all connections for all CIDs to find orphans is
+ * inefficient. Room for improvement here. */
+ vsock_for_each_connected_socket(vhost_vsock_reset_orphans);
+
+ /* The return value of vhost_vsock_stop() is deliberately ignored here. */
+ vhost_vsock_stop(vsock);
+ vhost_vsock_flush(vsock);
+ vhost_dev_stop(&vsock->dev);
+
+ /* Free every packet still sitting on send_pkt_list. */
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ while (!list_empty(&vsock->send_pkt_list)) {
+ struct virtio_vsock_pkt *pkt;
+
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ vhost_dev_cleanup(&vsock->dev, false);
+ /* NOTE(review): presumably the vqs array allocated in
+ * vhost_vsock_dev_open() and passed to vhost_dev_init() - confirm.
+ */
+ kfree(vsock->dev.vqs);
+ vhost_vsock_free(vsock);
+ return 0;
+}
+
+/*
+ * Assign @guest_cid to @vsock and (re)insert it into the CID hash.
+ *
+ * Returns 0 on success, -EINVAL for a reserved or out-of-range CID, or
+ * -EADDRINUSE when a different instance already owns the CID.
+ */
+static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
+{
+	struct vhost_vsock *owner;
+	int rc = 0;
+
+	/* CIDs up to and including VMADDR_CID_HOST are reserved */
+	if (guest_cid <= VMADDR_CID_HOST)
+		return -EINVAL;
+
+	/* U32_MAX is reserved and 64-bit CIDs are not yet supported */
+	if (guest_cid >= U32_MAX)
+		return -EINVAL;
+
+	spin_lock_bh(&vhost_vsock_lock);
+
+	/* Refuse if another instance already uses this CID */
+	owner = vhost_vsock_get(guest_cid);
+	if (owner && owner != vsock) {
+		rc = -EADDRINUSE;
+		goto unlock;
+	}
+
+	/* Drop the old hash entry before switching to the new CID */
+	if (vsock->guest_cid)
+		hash_del_rcu(&vsock->hash);
+
+	vsock->guest_cid = guest_cid;
+	hash_add_rcu(vhost_vsock_hash, &vsock->hash, guest_cid);
+
+unlock:
+	spin_unlock_bh(&vhost_vsock_lock);
+	return rc;
+}
+
+/*
+ * Record the feature bits acknowledged for this device on every virtqueue.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if @features contains a bit outside
+ * VHOST_VSOCK_FEATURES, or -EFAULT if VHOST_F_LOG_ALL is requested while
+ * vhost_log_access_ok() fails.
+ */
+static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
+{
+	struct vhost_virtqueue *vq;
+	int rc = 0;
+	int n;
+
+	if (features & ~VHOST_VSOCK_FEATURES)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&vsock->dev.mutex);
+
+	if ((features & (1 << VHOST_F_LOG_ALL)) &&
+	    !vhost_log_access_ok(&vsock->dev)) {
+		rc = -EFAULT;
+		goto unlock;
+	}
+
+	/* Each virtqueue is updated under its own mutex */
+	for (n = 0; n < ARRAY_SIZE(vsock->vqs); n++) {
+		vq = &vsock->vqs[n];
+
+		mutex_lock(&vq->mutex);
+		vq->acked_features = features;
+		mutex_unlock(&vq->mutex);
+	}
+
+unlock:
+	mutex_unlock(&vsock->dev.mutex);
+	return rc;
+}
+
+/*
+ * ioctl() handler for the vhost-vsock device.
+ *
+ * The vsock-specific commands and the feature get/set commands are handled
+ * here. Every other command goes to vhost_dev_ioctl() and, if that returns
+ * -ENOIOCTLCMD, on to vhost_vring_ioctl(), all under the device mutex.
+ *
+ * Returns the handler's result, or -EFAULT when user memory cannot be
+ * copied.
+ */
+static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
+				  unsigned long arg)
+{
+	struct vhost_vsock *vsock = f->private_data;
+	void __user *uptr = (void __user *)arg;
+	u64 feature_bits;
+	u64 cid;
+	int running;
+	int rc;
+
+	switch (ioctl) {
+	case VHOST_VSOCK_SET_GUEST_CID:
+		if (copy_from_user(&cid, uptr, sizeof(cid)))
+			return -EFAULT;
+		return vhost_vsock_set_cid(vsock, cid);
+
+	case VHOST_VSOCK_SET_RUNNING:
+		if (copy_from_user(&running, uptr, sizeof(running)))
+			return -EFAULT;
+		return running ? vhost_vsock_start(vsock)
+			       : vhost_vsock_stop(vsock);
+
+	case VHOST_GET_FEATURES:
+		feature_bits = VHOST_VSOCK_FEATURES;
+		if (copy_to_user(uptr, &feature_bits, sizeof(feature_bits)))
+			return -EFAULT;
+		return 0;
+
+	case VHOST_SET_FEATURES:
+		if (copy_from_user(&feature_bits, uptr, sizeof(feature_bits)))
+			return -EFAULT;
+		return vhost_vsock_set_features(vsock, feature_bits);
+
+	default:
+		break;
+	}
+
+	/* Generic vhost commands: device-level first, then per-vring. */
+	mutex_lock(&vsock->dev.mutex);
+	rc = vhost_dev_ioctl(&vsock->dev, ioctl, uptr);
+	if (rc == -ENOIOCTLCMD)
+		rc = vhost_vring_ioctl(&vsock->dev, ioctl, uptr);
+	else
+		vhost_vsock_flush(vsock);
+	mutex_unlock(&vsock->dev.mutex);
+
+	return rc;
+}
+
+/* File operations of the vhost-vsock character device. Seeking is a no-op
+ * (noop_llseek); all control goes through the unlocked ioctl handler.
+ */
+static const struct file_operations vhost_vsock_fops = {
+ .owner = THIS_MODULE,
+ .open = vhost_vsock_dev_open,
+ .release = vhost_vsock_dev_release,
+ .llseek = noop_llseek,
+ .unlocked_ioctl = vhost_vsock_dev_ioctl,
+};
+
+/* Misc device named "vhost-vsock" with a dynamically assigned minor; it is
+ * registered in vhost_vsock_init() and deregistered in vhost_vsock_exit().
+ */
+static struct miscdevice vhost_vsock_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "vhost-vsock",
+ .fops = &vhost_vsock_fops,
+};
+
+/* Transport description whose embedded .transport member is passed to
+ * vsock_core_init() in vhost_vsock_init().
+ *
+ * Only get_local_cid, cancel_pkt and send_pkt point at vhost_transport_*
+ * functions; every other callback is one of the virtio_transport_* helpers.
+ */
+static struct virtio_transport vhost_transport = {
+ .transport = {
+ .get_local_cid = vhost_transport_get_local_cid,
+
+ .init = virtio_transport_do_socket_init,
+ .destruct = virtio_transport_destruct,
+ .release = virtio_transport_release,
+ .connect = virtio_transport_connect,
+ .shutdown = virtio_transport_shutdown,
+ .cancel_pkt = vhost_transport_cancel_pkt,
+
+ .dgram_enqueue = virtio_transport_dgram_enqueue,
+ .dgram_dequeue = virtio_transport_dgram_dequeue,
+ .dgram_bind = virtio_transport_dgram_bind,
+ .dgram_allow = virtio_transport_dgram_allow,
+
+ .stream_enqueue = virtio_transport_stream_enqueue,
+ .stream_dequeue = virtio_transport_stream_dequeue,
+ .stream_has_data = virtio_transport_stream_has_data,
+ .stream_has_space = virtio_transport_stream_has_space,
+ .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
+ .stream_is_active = virtio_transport_stream_is_active,
+ .stream_allow = virtio_transport_stream_allow,
+
+ .notify_poll_in = virtio_transport_notify_poll_in,
+ .notify_poll_out = virtio_transport_notify_poll_out,
+ .notify_recv_init = virtio_transport_notify_recv_init,
+ .notify_recv_pre_block = virtio_transport_notify_recv_pre_block,
+ .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue,
+ .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
+ .notify_send_init = virtio_transport_notify_send_init,
+ .notify_send_pre_block = virtio_transport_notify_send_pre_block,
+ .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
+ .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
+
+ .set_buffer_size = virtio_transport_set_buffer_size,
+ .set_min_buffer_size = virtio_transport_set_min_buffer_size,
+ .set_max_buffer_size = virtio_transport_set_max_buffer_size,
+ .get_buffer_size = virtio_transport_get_buffer_size,
+ .get_min_buffer_size = virtio_transport_get_min_buffer_size,
+ .get_max_buffer_size = virtio_transport_get_max_buffer_size,
+ },
+
+ .send_pkt = vhost_transport_send_pkt,
+};
+
+/*
+ * Module init: register the vsock transport, then the misc device.
+ *
+ * If misc_register() fails, the transport registration is rolled back with
+ * vsock_core_exit() so that a failed module load does not leave the
+ * transport registered.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __init vhost_vsock_init(void)
+{
+	int ret;
+
+	ret = vsock_core_init(&vhost_transport.transport);
+	if (ret < 0)
+		return ret;
+
+	ret = misc_register(&vhost_vsock_misc);
+	if (ret)
+		vsock_core_exit();
+
+	return ret;
+}
+
+/*
+ * Module exit: undo vhost_vsock_init() in reverse order, removing the misc
+ * device first and then the vsock transport registration.
+ */
+static void __exit vhost_vsock_exit(void)
+{
+	misc_deregister(&vhost_vsock_misc);
+	vsock_core_exit();
+}
+
+/* Module entry/exit hooks and metadata. */
+module_init(vhost_vsock_init);
+module_exit(vhost_vsock_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("vhost transport for vsock ");
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index cab9f3f63a38..77590320d44c 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -60,7 +60,7 @@ config VIRTIO_INPUT
config VIRTIO_MMIO
tristate "Platform bus driver for memory mapped virtio devices"
- depends on HAS_IOMEM
+ depends on HAS_IOMEM && HAS_DMA
select VIRTIO
---help---
This drivers provides support for memory mapped virtio
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 7d4c7f35e5cf..f77358f08930 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -401,7 +401,7 @@ static int init_vqs(struct virtio_balloon *vb)
{
struct virtqueue *vqs[3];
vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
- const char *names[] = { "inflate", "deflate", "stats" };
+ static const char * const names[] = { "inflate", "deflate", "stats" };
int err, nvqs;
/*
diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c
index c96944b59856..350a2a5a49db 100644
--- a/drivers/virtio/virtio_input.c
+++ b/drivers/virtio/virtio_input.c
@@ -170,7 +170,7 @@ static int virtinput_init_vqs(struct virtio_input *vi)
struct virtqueue *vqs[2];
vq_callback_t *cbs[] = { virtinput_recv_events,
virtinput_recv_status };
- static const char *names[] = { "events", "status" };
+ static const char * const names[] = { "events", "status" };
int err;
err = vi->vdev->config->find_vqs(vi->vdev, 2, vqs, cbs, names);
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index f499d9da7237..745c6ee1bb3e 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -482,7 +482,7 @@ error_available:
static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[])
+ const char * const names[])
{
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
unsigned int irq = platform_get_irq(vm_dev->pdev, 0);
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 2046a68ad0ba..f6bed86c17f9 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -296,7 +296,7 @@ void vp_del_vqs(struct virtio_device *vdev)
static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[],
+ const char * const names[],
bool use_msix,
bool per_vq_vectors)
{
@@ -376,7 +376,7 @@ error_find:
int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[])
+ const char * const names[])
{
int err;
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index b976d968e793..2cc252270b2d 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -139,7 +139,7 @@ void vp_del_vqs(struct virtio_device *vdev);
int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[]);
+ const char * const names[]);
const char *vp_bus_name(struct virtio_device *vdev);
/* Setup the affinity for a virtqueue:
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 4469202eaa8e..631021cfc740 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -423,7 +423,7 @@ err_new_queue:
static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[])
+ const char * const names[])
{
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
struct virtqueue *vq;
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a01a41a41269..761f28ffd40e 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,6 +24,8 @@
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/kmemleak.h>
+#include <linux/dma-mapping.h>
+#include <xen/xen.h>
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
@@ -54,6 +56,11 @@
#define END_USE(vq)
#endif
+/* Per-descriptor bookkeeping kept in vring_virtqueue.desc_state[]; the
+ * entry at a buffer's head index holds that buffer's state.
+ */
+struct vring_desc_state {
+ void *data; /* Data for callback. */
+ struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
+};
+
struct vring_virtqueue {
struct virtqueue vq;
@@ -98,12 +105,131 @@ struct vring_virtqueue {
ktime_t last_add_time;
#endif
- /* Tokens for callbacks. */
- void *data[];
+ /* Per-descriptor state. */
+ struct vring_desc_state desc_state[];
};
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+/*
+ * Modern virtio devices have feature bits to specify whether they need a
+ * quirk and bypass the IOMMU. If not there, just use the DMA API.
+ *
+ * If there, the interaction between virtio and DMA API is messy.
+ *
+ * On most systems with virtio, physical addresses match bus addresses,
+ * and it doesn't particularly matter whether we use the DMA API.
+ *
+ * On some systems, including Xen and any system with a physical device
+ * that speaks virtio behind a physical IOMMU, we must use the DMA API
+ * for virtio DMA to work at all.
+ *
+ * On other systems, including SPARC and PPC64, virtio-pci devices are
+ * enumerated as though they are behind an IOMMU, but the virtio host
+ * ignores the IOMMU, so we must either pretend that the IOMMU isn't
+ * there or somehow map everything as the identity.
+ *
+ * For the time being, we preserve historic behavior and bypass the DMA
+ * API.
+ *
+ * TODO: install a per-device DMA ops structure that does the right thing
+ * taking into account all the above quirks, and use the DMA API
+ * unconditionally on data path.
+ */
+
+/*
+ * Decide whether buffers of @vdev must be mapped through the DMA API.
+ *
+ * Returns true when the device does not have the IOMMU quirk, or when
+ * running as a Xen guest; false otherwise.
+ */
+static bool vring_use_dma_api(struct virtio_device *vdev)
+{
+	/* No quirk: the device goes through the DMA API. */
+	if (!virtio_has_iommu_quirk(vdev))
+		return true;
+
+	/*
+	 * Otherwise, we are left to guess.
+	 *
+	 * In theory, a buggy emulated Q35 IOMMU and Xen could be enabled at
+	 * the same time. On such a configuration, virtio has never worked
+	 * and will not work without an even larger kludge. Instead, enable
+	 * the DMA API whenever we are a Xen guest, which at least allows
+	 * all of the sensible Xen configurations to work correctly.
+	 */
+	return xen_domain();
+}
+
+/*
+ * The DMA ops on various arches are rather gnarly right now, and
+ * making all of the arch DMA ops work on the vring device itself
+ * is a mess. For now, we use the parent device for DMA ops.
+ *
+ * Returns the parent of the virtio device that owns @vq. This helper is
+ * only used by the mapping routines in this file, so it has internal
+ * linkage.
+ */
+static struct device *vring_dma_dev(const struct vring_virtqueue *vq)
+{
+	return vq->vq.vdev->dev.parent;
+}
+
+/*
+ * Produce the device-visible address for one scatterlist entry.
+ *
+ * When the DMA API is bypassed this is simply the physical address of the
+ * entry; otherwise the entry's page is mapped for @direction. Callers check
+ * the result with vring_mapping_error().
+ */
+static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
+				   struct scatterlist *sg,
+				   enum dma_data_direction direction)
+{
+	if (vring_use_dma_api(vq->vq.vdev)) {
+		/*
+		 * dma_map_sg cannot be used here: we don't guarantee that
+		 * the scatterlist outlives the mapping, which is what
+		 * dma_map_sg expects.
+		 */
+		return dma_map_page(vring_dma_dev(vq), sg_page(sg),
+				    sg->offset, sg->length, direction);
+	}
+
+	return (dma_addr_t)sg_phys(sg);
+}
+
+/*
+ * Produce the device-visible address for a kernel buffer of @size bytes at
+ * @cpu_addr: a DMA mapping when the DMA API is in use, otherwise the plain
+ * physical address.
+ */
+static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
+				   void *cpu_addr, size_t size,
+				   enum dma_data_direction direction)
+{
+	if (vring_use_dma_api(vq->vq.vdev))
+		return dma_map_single(vring_dma_dev(vq), cpu_addr, size,
+				      direction);
+
+	return (dma_addr_t)virt_to_phys(cpu_addr);
+}
+
+/*
+ * Undo the DMA mapping described by @desc. Does nothing when the DMA API
+ * is bypassed.
+ *
+ * The direction is derived from VRING_DESC_F_WRITE. A descriptor flagged
+ * VRING_DESC_F_INDIRECT is released with dma_unmap_single(), any other with
+ * dma_unmap_page().
+ */
+static void vring_unmap_one(const struct vring_virtqueue *vq,
+			    struct vring_desc *desc)
+{
+	enum dma_data_direction dir;
+	dma_addr_t dma;
+	u32 bytes;
+	u16 flags;
+
+	if (!vring_use_dma_api(vq->vq.vdev))
+		return;
+
+	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+	dma = virtio64_to_cpu(vq->vq.vdev, desc->addr);
+	bytes = virtio32_to_cpu(vq->vq.vdev, desc->len);
+	dir = (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+	if (flags & VRING_DESC_F_INDIRECT)
+		dma_unmap_single(vring_dma_dev(vq), dma, bytes, dir);
+	else
+		dma_unmap_page(vring_dma_dev(vq), dma, bytes, dir);
+}
+
+/*
+ * Report whether @addr, as returned by one of the vring mapping helpers,
+ * denotes a failed mapping. Always 0 when the DMA API is bypassed.
+ */
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+			       dma_addr_t addr)
+{
+	if (vring_use_dma_api(vq->vq.vdev))
+		return dma_mapping_error(vring_dma_dev(vq), addr);
+
+	return 0;
+}
+
static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
unsigned int total_sg, gfp_t gfp)
{
@@ -137,7 +263,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
struct vring_virtqueue *vq = to_vvq(_vq);
struct scatterlist *sg;
struct vring_desc *desc;
- unsigned int i, n, avail, descs_used, uninitialized_var(prev);
+ unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
int head;
bool indirect;
@@ -177,21 +303,15 @@ static inline int virtqueue_add(struct virtqueue *_vq,
if (desc) {
/* Use a single buffer which doesn't continue */
- vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
- vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc));
- /* avoid kmemleak false positive (hidden by virt_to_phys) */
- kmemleak_ignore(desc);
- vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
-
+ indirect = true;
/* Set up rest to use this indirect table. */
i = 0;
descs_used = 1;
- indirect = true;
} else {
+ indirect = false;
desc = vq->vring.desc;
i = head;
descs_used = total_sg;
- indirect = false;
}
if (vq->vq.num_free < descs_used) {
@@ -208,13 +328,14 @@ static inline int virtqueue_add(struct virtqueue *_vq,
return -ENOSPC;
}
- /* We're about to use some buffers from the free list. */
- vq->vq.num_free -= descs_used;
-
for (n = 0; n < out_sgs; n++) {
for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+ dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
+ if (vring_mapping_error(vq, addr))
+ goto unmap_release;
+
desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
- desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
+ desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
prev = i;
i = virtio16_to_cpu(_vq->vdev, desc[i].next);
@@ -222,8 +343,12 @@ static inline int virtqueue_add(struct virtqueue *_vq,
}
for (; n < (out_sgs + in_sgs); n++) {
for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+ dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
+ if (vring_mapping_error(vq, addr))
+ goto unmap_release;
+
desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
- desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
+ desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
prev = i;
i = virtio16_to_cpu(_vq->vdev, desc[i].next);
@@ -232,14 +357,33 @@ static inline int virtqueue_add(struct virtqueue *_vq,
/* Last one doesn't continue. */
desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+ if (indirect) {
+ /* Now that the indirect table is filled in, map it. */
+ dma_addr_t addr = vring_map_single(
+ vq, desc, total_sg * sizeof(struct vring_desc),
+ DMA_TO_DEVICE);
+ if (vring_mapping_error(vq, addr))
+ goto unmap_release;
+
+ vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
+ vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
+
+ vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
+ }
+
+ /* We're using some buffers from the free list. */
+ vq->vq.num_free -= descs_used;
+
/* Update free pointer */
if (indirect)
vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
else
vq->free_head = i;
- /* Set token. */
- vq->data[head] = data;
+ /* Store token and indirect buffer state. */
+ vq->desc_state[head].data = data;
+ if (indirect)
+ vq->desc_state[head].indir_desc = desc;
/* Put entry in available array (but don't update avail->idx until they
* do sync). */
@@ -262,6 +406,24 @@ static inline int virtqueue_add(struct virtqueue *_vq,
virtqueue_kick(_vq);
return 0;
+
+unmap_release:
+ err_idx = i;
+ i = head;
+
+ for (n = 0; n < total_sg; n++) {
+ if (i == err_idx)
+ break;
+ vring_unmap_one(vq, &desc[i]);
+ i = vq->vring.desc[i].next;
+ }
+
+ vq->vq.num_free += total_sg;
+
+ if (indirect)
+ kfree(desc);
+
+ return -EIO;
}
/**
@@ -432,27 +594,43 @@ EXPORT_SYMBOL_GPL(virtqueue_kick);
static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
- unsigned int i;
+ unsigned int i, j;
+ u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
/* Clear data ptr. */
- vq->data[head] = NULL;
+ vq->desc_state[head].data = NULL;
- /* Put back on free list: find end */
+ /* Put back on free list: unmap first-level descriptors and find end */
i = head;
- /* Free the indirect table */
- if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))
- kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr)));
-
- while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) {
+ while (vq->vring.desc[i].flags & nextflag) {
+ vring_unmap_one(vq, &vq->vring.desc[i]);
i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
vq->vq.num_free++;
}
+ vring_unmap_one(vq, &vq->vring.desc[i]);
vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
vq->free_head = head;
+
/* Plus final descriptor */
vq->vq.num_free++;
+
+ /* Free the indirect table, if any, now that it's unmapped. */
+ if (vq->desc_state[head].indir_desc) {
+ struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
+ u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
+
+ BUG_ON(!(vq->vring.desc[head].flags &
+ cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
+ BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+
+ for (j = 0; j < len / sizeof(struct vring_desc); j++)
+ vring_unmap_one(vq, &indir_desc[j]);
+
+ kfree(vq->desc_state[head].indir_desc);
+ vq->desc_state[head].indir_desc = NULL;
+ }
}
static inline bool more_used(const struct vring_virtqueue *vq)
@@ -507,13 +685,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
BAD_RING(vq, "id %u out of range\n", i);
return NULL;
}
- if (unlikely(!vq->data[i])) {
+ if (unlikely(!vq->desc_state[i].data)) {
BAD_RING(vq, "id %u is not a head!\n", i);
return NULL;
}
/* detach_buf clears data, so grab it now. */
- ret = vq->data[i];
+ ret = vq->desc_state[i].data;
detach_buf(vq, i);
vq->last_used_idx++;
/* If we expect an interrupt for the next entry, tell host
@@ -687,10 +865,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
START_USE(vq);
for (i = 0; i < vq->vring.num; i++) {
- if (!vq->data[i])
+ if (!vq->desc_state[i].data)
continue;
/* detach_buf clears data, so grab it now. */
- buf = vq->data[i];
+ buf = vq->desc_state[i].data;
detach_buf(vq, i);
vq->avail_idx_shadow--;
vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
@@ -744,7 +922,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
return NULL;
}
- vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
+ vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
+ GFP_KERNEL);
if (!vq)
return NULL;
@@ -779,11 +958,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
/* Put everything in free lists. */
vq->free_head = 0;
- for (i = 0; i < num-1; i++) {
+ for (i = 0; i < num-1; i++)
vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
- vq->data[i] = NULL;
- }
- vq->data[i] = NULL;
+ memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
return &vq->vq;
}
@@ -809,6 +986,8 @@ void vring_transport_features(struct virtio_device *vdev)
break;
case VIRTIO_F_VERSION_1:
break;
+ case VIRTIO_F_IOMMU_PLATFORM:
+ break;
default:
/* We don't understand this bit. */
__virtio_clear_bit(vdev, i);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6d1d0b93b1aa..c792df826e12 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o
+ uuid-tree.o props.o hash.o tree-checker.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 38ee08675468..8f4baa3cb992 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1726,20 +1726,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
return err;
}
-/*
- * The leaf data grows from end-to-front in the node.
- * this returns the address of the start of the last item,
- * which is the stop of the leaf data stack
- */
-static inline unsigned int leaf_data_end(struct btrfs_root *root,
- struct extent_buffer *leaf)
-{
- u32 nr = btrfs_header_nritems(leaf);
- if (nr == 0)
- return BTRFS_LEAF_DATA_SIZE(root);
- return btrfs_item_offset_nr(leaf, nr - 1);
-}
-
/*
* search for key in the extent_buffer. The items start at offset p,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e847573c6db0..4a91d3119e59 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
#include <linux/btrfs.h>
#include <linux/workqueue.h>
#include <linux/security.h>
+#include <linux/sizes.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -897,6 +898,7 @@ struct btrfs_balance_item {
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
+#define BTRFS_FILE_EXTENT_TYPES 2
struct btrfs_file_extent_item {
/*
@@ -2283,7 +2285,7 @@ do { \
#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
struct btrfs_map_token {
- struct extent_buffer *eb;
+ const struct extent_buffer *eb;
char *kaddr;
unsigned long offset;
};
@@ -2314,18 +2316,19 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
sizeof(((type *)0)->member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
- unsigned long off, \
- struct btrfs_map_token *token); \
-void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
+u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off, \
+ struct btrfs_map_token *token); \
+void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \
unsigned long off, u##bits val, \
struct btrfs_map_token *token); \
-static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
+ const void *ptr, \
unsigned long off) \
{ \
return btrfs_get_token_##bits(eb, ptr, off, NULL); \
} \
-static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\
unsigned long off, u##bits val) \
{ \
btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
@@ -2337,7 +2340,8 @@ DECLARE_BTRFS_SETGET_BITS(32)
DECLARE_BTRFS_SETGET_BITS(64)
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
+ const type *s) \
{ \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
@@ -2348,7 +2352,8 @@ static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
-static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\
+ const type *s, \
struct btrfs_map_token *token) \
{ \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
@@ -2363,9 +2368,9 @@ static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(struct extent_buffer *eb) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- type *p = page_address(eb->pages[0]); \
+ const type *p = page_address(eb->pages[0]); \
u##bits res = le##bits##_to_cpu(p->member); \
return res; \
} \
@@ -2377,7 +2382,7 @@ static inline void btrfs_set_##name(struct extent_buffer *eb, \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(type *s) \
+static inline u##bits btrfs_##name(const type *s) \
{ \
return le##bits##_to_cpu(s->member); \
} \
@@ -2678,7 +2683,7 @@ static inline unsigned long btrfs_node_key_ptr_offset(int nr)
sizeof(struct btrfs_key_ptr) * nr;
}
-void btrfs_node_key(struct extent_buffer *eb,
+void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr);
static inline void btrfs_set_node_key(struct extent_buffer *eb,
@@ -2707,28 +2712,28 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
return (struct btrfs_item *)btrfs_item_nr_offset(nr);
}
-static inline u32 btrfs_item_end(struct extent_buffer *eb,
+static inline u32 btrfs_item_end(const struct extent_buffer *eb,
struct btrfs_item *item)
{
return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
}
-static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
{
return btrfs_item_end(eb, btrfs_item_nr(nr));
}
-static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
{
return btrfs_item_offset(eb, btrfs_item_nr(nr));
}
-static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
{
return btrfs_item_size(eb, btrfs_item_nr(nr));
}
-static inline void btrfs_item_key(struct extent_buffer *eb,
+static inline void btrfs_item_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
struct btrfs_item *item = btrfs_item_nr(nr);
@@ -2764,8 +2769,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
transid, 64);
-static inline void btrfs_dir_item_key(struct extent_buffer *eb,
- struct btrfs_dir_item *item,
+static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
struct btrfs_disk_key *key)
{
read_eb_member(eb, item, struct btrfs_dir_item, location, key);
@@ -2773,7 +2778,7 @@ static inline void btrfs_dir_item_key(struct extent_buffer *eb,
static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
struct btrfs_dir_item *item,
- struct btrfs_disk_key *key)
+ const struct btrfs_disk_key *key)
{
write_eb_member(eb, item, struct btrfs_dir_item, location, key);
}
@@ -2785,8 +2790,8 @@ BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
generation, 64);
-static inline void btrfs_free_space_key(struct extent_buffer *eb,
- struct btrfs_free_space_header *h,
+static inline void btrfs_free_space_key(const struct extent_buffer *eb,
+ const struct btrfs_free_space_header *h,
struct btrfs_disk_key *key)
{
read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
@@ -2794,7 +2799,7 @@ static inline void btrfs_free_space_key(struct extent_buffer *eb,
static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
struct btrfs_free_space_header *h,
- struct btrfs_disk_key *key)
+ const struct btrfs_disk_key *key)
{
write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
}
@@ -2821,25 +2826,25 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
disk->objectid = cpu_to_le64(cpu->objectid);
}
-static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
- struct btrfs_key *key, int nr)
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
{
struct btrfs_disk_key disk_key;
btrfs_node_key(eb, &disk_key, nr);
btrfs_disk_key_to_cpu(key, &disk_key);
}
-static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
- struct btrfs_key *key, int nr)
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
{
struct btrfs_disk_key disk_key;
btrfs_item_key(eb, &disk_key, nr);
btrfs_disk_key_to_cpu(key, &disk_key);
}
-static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
- struct btrfs_dir_item *item,
- struct btrfs_key *key)
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *key)
{
struct btrfs_disk_key disk_key;
btrfs_dir_item_key(eb, item, &disk_key);
@@ -2872,7 +2877,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
nritems, 32);
BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
-static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
{
return (btrfs_header_flags(eb) & flag) == flag;
}
@@ -2891,7 +2896,7 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
return (flags & flag) == flag;
}
-static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
+static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
{
u64 flags = btrfs_header_flags(eb);
return flags >> BTRFS_BACKREF_REV_SHIFT;
@@ -2911,12 +2916,12 @@ static inline unsigned long btrfs_header_fsid(void)
return offsetof(struct btrfs_header, fsid);
}
-static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb)
{
return offsetof(struct btrfs_header, chunk_tree_uuid);
}
-static inline int btrfs_is_leaf(struct extent_buffer *eb)
+static inline int btrfs_is_leaf(const struct extent_buffer *eb)
{
return btrfs_header_level(eb) == 0;
}
@@ -2950,12 +2955,12 @@ BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
rtransid, 64);
-static inline bool btrfs_root_readonly(struct btrfs_root *root)
+static inline bool btrfs_root_readonly(const struct btrfs_root *root)
{
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
-static inline bool btrfs_root_dead(struct btrfs_root *root)
+static inline bool btrfs_root_dead(const struct btrfs_root *root)
{
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}
@@ -3012,51 +3017,51 @@ BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
-static inline void btrfs_balance_data(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
+static inline void btrfs_balance_data(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
struct btrfs_disk_balance_args *ba)
{
read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
}
static inline void btrfs_set_balance_data(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
{
write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
}
-static inline void btrfs_balance_meta(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
+static inline void btrfs_balance_meta(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
struct btrfs_disk_balance_args *ba)
{
read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
}
static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
{
write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
}
-static inline void btrfs_balance_sys(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
+static inline void btrfs_balance_sys(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
struct btrfs_disk_balance_args *ba)
{
read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}
static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
{
write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}
static inline void
btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
- struct btrfs_disk_balance_args *disk)
+ const struct btrfs_disk_balance_args *disk)
{
memset(cpu, 0, sizeof(*cpu));
@@ -3076,7 +3081,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
static inline void
btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
- struct btrfs_balance_args *cpu)
+ const struct btrfs_balance_args *cpu)
{
memset(disk, 0, sizeof(*disk));
@@ -3144,7 +3149,7 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
-static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+static inline int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
/*
@@ -3158,6 +3163,21 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
return offsetof(struct btrfs_leaf, items);
}
+/*
+ * The leaf data grows from end-to-front in the node.
+ * This returns the offset of the start of the last item,
+ * which is the stop of the leaf data stack
+ */
+static inline unsigned int leaf_data_end(const struct btrfs_root *root,
+ const struct extent_buffer *leaf)
+{
+ u32 nr = btrfs_header_nritems(leaf);
+
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(root);
+ return btrfs_item_offset_nr(leaf, nr - 1);
+}
+
/* struct btrfs_file_extent_item */
BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
@@ -3174,7 +3194,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
struct btrfs_file_extent_item, compression, 8);
static inline unsigned long
-btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
+btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e)
{
return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
@@ -3208,8 +3228,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
* size of any extent headers. If a file is compressed on disk, this is
* the compressed size
*/
-static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
- struct btrfs_item *e)
+static inline u32 btrfs_file_extent_inline_item_len(
+ const struct extent_buffer *eb,
+ struct btrfs_item *e)
{
return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
@@ -3217,9 +3238,9 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
/* this returns the number of file bytes represented by the inline item.
* If an item is compressed, this is the uncompressed size
*/
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
- int slot,
- struct btrfs_file_extent_item *fi)
+static inline u32 btrfs_file_extent_inline_len(const struct extent_buffer *eb,
+ int slot,
+ const struct btrfs_file_extent_item *fi)
{
struct btrfs_map_token token;
@@ -3241,8 +3262,8 @@ static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
/* btrfs_dev_stats_item */
-static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
- struct btrfs_dev_stats_item *ptr,
+static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
+ const struct btrfs_dev_stats_item *ptr,
int index)
{
u64 val;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 176a27bc63aa..81e5bc62e8e3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -620,7 +620,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
em = lookup_extent_mapping(em_tree, start, (u64)-1);
if (!em)
break;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1f21c6c33228..f80a0af68736 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -49,6 +49,7 @@
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
+#include "tree-checker.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -522,72 +523,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
return ret;
}
-#define CORRUPT(reason, eb, root, slot) \
- btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \
- "root=%llu, slot=%d", reason, \
- btrfs_header_bytenr(eb), root->objectid, slot)
-
-static noinline int check_leaf(struct btrfs_root *root,
- struct extent_buffer *leaf)
-{
- struct btrfs_key key;
- struct btrfs_key leaf_key;
- u32 nritems = btrfs_header_nritems(leaf);
- int slot;
-
- if (nritems == 0)
- return 0;
-
- /* Check the 0 item */
- if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
- BTRFS_LEAF_DATA_SIZE(root)) {
- CORRUPT("invalid item offset size pair", leaf, root, 0);
- return -EIO;
- }
-
- /*
- * Check to make sure each items keys are in the correct order and their
- * offsets make sense. We only have to loop through nritems-1 because
- * we check the current slot against the next slot, which verifies the
- * next slot's offset+size makes sense and that the current's slot
- * offset is correct.
- */
- for (slot = 0; slot < nritems - 1; slot++) {
- btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
- btrfs_item_key_to_cpu(leaf, &key, slot + 1);
-
- /* Make sure the keys are in the right order */
- if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
- CORRUPT("bad key order", leaf, root, slot);
- return -EIO;
- }
-
- /*
- * Make sure the offset and ends are right, remember that the
- * item data starts at the end of the leaf and grows towards the
- * front.
- */
- if (btrfs_item_offset_nr(leaf, slot) !=
- btrfs_item_end_nr(leaf, slot + 1)) {
- CORRUPT("slot offset bad", leaf, root, slot);
- return -EIO;
- }
-
- /*
- * Check to make sure that we don't point outside of the leaf,
- * just incase all the items are consistent to eachother, but
- * all point outside of the leaf.
- */
- if (btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(root)) {
- CORRUPT("slot end outside of leaf", leaf, root, slot);
- return -EIO;
- }
- }
-
- return 0;
-}
-
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
u64 phy_offset, struct page *page,
u64 start, u64 end, int mirror)
@@ -654,11 +589,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && check_leaf(root, eb)) {
+ if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
+ if (found_level > 0 && btrfs_check_node(root, eb))
+ ret = -EIO;
+
if (!ret)
set_extent_buffer_uptodate(eb);
err:
@@ -3958,7 +3896,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
buf->len,
root->fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ /*
+ * btrfs_mark_buffer_dirty() can be called with the item pointers set
+ * but the item data not yet updated, so only the item pointers are
+ * checked here, not the item data.
+ */
+ if (btrfs_header_level(buf) == 0 &&
+ btrfs_check_leaf_relaxed(root, buf)) {
btrfs_print_leaf(root, buf);
ASSERT(0);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 13ff0fdae03e..978bbfed5a2c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2342,7 +2342,13 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
}
- BUG_ON(node->ref_mod != 1);
+ if (node->ref_mod != 1) {
+ btrfs_err(root->fs_info,
+ "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
+ node->bytenr, node->ref_mod, node->action, ref_root,
+ parent);
+ return -EIO;
+ }
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, root,
@@ -9481,6 +9487,8 @@ static int find_first_block_group(struct btrfs_root *root,
int ret = 0;
struct btrfs_key found_key;
struct extent_buffer *leaf;
+ struct btrfs_block_group_item bg;
+ u64 flags;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
@@ -9502,7 +9510,47 @@ static int find_first_block_group(struct btrfs_root *root,
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, found_key.objectid,
+ found_key.offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(root->fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ found_key.objectid, found_key.offset);
+ ret = -ENOENT;
+ } else if (em->start != found_key.objectid ||
+ em->len != found_key.offset) {
+ btrfs_err(root->fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ found_key.objectid, found_key.offset,
+ em->start, em->len);
+ ret = -EUCLEAN;
+ } else {
+ read_extent_buffer(leaf, &bg,
+ btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type &
+ BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(root->fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ found_key.objectid,
+ found_key.offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK &
+ em->map_lookup->type));
+ ret = -EUCLEAN;
+ } else {
+ ret = 0;
+ }
+ }
+ free_extent_map(em);
goto out;
}
path->slots[0]++;
@@ -9717,6 +9765,62 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
return cache;
}
+
+/*
+ * Iterate all chunks and verify that each of them has the corresponding block
+ * group
+ */
+static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct btrfs_block_group_cache *bg;
+ u64 start = 0;
+ int ret = 0;
+
+ while (1) {
+ read_lock(&map_tree->map_tree.lock);
+ /*
+ * lookup_extent_mapping will return the first extent map
+ * intersecting the range, so setting @len to 1 is enough to
+ * get the first chunk.
+ */
+ em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
+ read_unlock(&map_tree->map_tree.lock);
+ if (!em)
+ break;
+
+ bg = btrfs_lookup_block_group(fs_info, em->start);
+ if (!bg) {
+ btrfs_err(fs_info,
+ "chunk start=%llu len=%llu doesn't have corresponding block group",
+ em->start, em->len);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ break;
+ }
+ if (bg->key.objectid != em->start ||
+ bg->key.offset != em->len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
+ em->start, em->len,
+ em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ bg->key.objectid, bg->key.offset,
+ bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ break;
+ }
+ start = em->start + em->len;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ }
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_root *root)
{
struct btrfs_path *path;
@@ -9903,7 +10007,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
}
init_global_block_rsv(info);
- ret = 0;
+ ret = check_chunk_block_group_mappings(info);
error:
btrfs_free_path(path);
return ret;
@@ -10388,7 +10492,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
num_items = 3 + map->num_stripes;
free_extent_map(em);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7de8d545f4d6..573c9d62cfc9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3847,8 +3847,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct block_device *bdev = fs_info->fs_devices->latest_bdev;
struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
+ u32 nritems;
unsigned long i, num_pages;
unsigned long bio_flags = 0;
+ unsigned long start, end;
int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
int ret = 0;
@@ -3858,6 +3860,23 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
bio_flags = EXTENT_BIO_TREE_LOG;
+ /* set btree blocks beyond nritems with 0 to avoid stale content. */
+ nritems = btrfs_header_nritems(eb);
+ if (btrfs_header_level(eb) > 0) {
+ end = btrfs_node_key_ptr_offset(nritems);
+
+ memset_extent_buffer(eb, 0, end, eb->len - end);
+ } else {
+ /*
+ * leaf:
+ * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
+ */
+ start = btrfs_item_nr_offset(nritems);
+ end = btrfs_leaf_data(eb) +
+ leaf_data_end(fs_info->tree_root, eb);
+ memset_extent_buffer(eb, 0, start, end - start);
+ }
+
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
@@ -5351,9 +5370,8 @@ unlock_exit:
return ret;
}
-void read_extent_buffer(struct extent_buffer *eb, void *dstv,
- unsigned long start,
- unsigned long len)
+void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5382,9 +5400,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
- unsigned long start,
- unsigned long len)
+int read_extent_buffer_to_user(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5419,10 +5437,10 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
return ret;
}
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
- unsigned long min_len, char **map,
- unsigned long *map_start,
- unsigned long *map_len)
+int map_private_extent_buffer(const struct extent_buffer *eb,
+ unsigned long start, unsigned long min_len,
+ char **map, unsigned long *map_start,
+ unsigned long *map_len)
{
size_t offset = start & (PAGE_CACHE_SIZE - 1);
char *kaddr;
@@ -5457,9 +5475,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
return 0;
}
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
- unsigned long start,
- unsigned long len)
+int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f4c1ae11855f..751435967724 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -308,14 +308,13 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
atomic_inc(&eb->refs);
}
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
- unsigned long start,
- unsigned long len);
-void read_extent_buffer(struct extent_buffer *eb, void *dst,
+int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
+ unsigned long start, unsigned long len);
+void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
- unsigned long start,
+int read_extent_buffer_to_user(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
unsigned long len);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
@@ -334,10 +333,10 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
- unsigned long min_len, char **map,
- unsigned long *map_start,
- unsigned long *map_len);
+int map_private_extent_buffer(const struct extent_buffer *eb,
+ unsigned long offset, unsigned long min_len,
+ char **map, unsigned long *map_start,
+ unsigned long *map_len);
int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6a98bddd8f33..84fb56d5c018 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em)
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->bdev);
+ kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index b2991fd8583e..eb8b8fae036b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -32,7 +32,15 @@ struct extent_map {
u64 block_len;
u64 generation;
unsigned long flags;
- struct block_device *bdev;
+ union {
+ struct block_device *bdev;
+
+ /*
+ * used for chunk mappings
+ * flags & EXTENT_FLAG_FS_MAPPING must be set
+ */
+ struct map_lookup *map_lookup;
+ };
atomic_t refs;
unsigned int compress_type;
struct list_head list;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dca9f937bf6..cc9ccc42f469 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3460,7 +3460,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
return ret;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (em->start != chunk_offset)
goto out;
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index b976597b0721..63ffd213b0b7 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -50,8 +50,8 @@ static inline void put_unaligned_le8(u8 val, void *p)
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
- unsigned long off, \
+u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off, \
struct btrfs_map_token *token) \
{ \
unsigned long part_offset = (unsigned long)ptr; \
@@ -90,7 +90,8 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
return res; \
} \
void btrfs_set_token_##bits(struct extent_buffer *eb, \
- void *ptr, unsigned long off, u##bits val, \
+ const void *ptr, unsigned long off, \
+ u##bits val, \
struct btrfs_map_token *token) \
{ \
unsigned long part_offset = (unsigned long)ptr; \
@@ -133,7 +134,7 @@ DEFINE_BTRFS_SETGET_BITS(16)
DEFINE_BTRFS_SETGET_BITS(32)
DEFINE_BTRFS_SETGET_BITS(64)
-void btrfs_node_key(struct extent_buffer *eb,
+void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr = btrfs_node_key_ptr_offset(nr);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
new file mode 100644
index 000000000000..5b98f3c76ce4
--- /dev/null
+++ b/fs/btrfs/tree-checker.c
@@ -0,0 +1,649 @@
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+/*
+ * The module is used to catch unexpected/corrupted tree block data.
+ * Such behavior can be caused either by a fuzzed image or bugs.
+ *
+ * The objective is to do leaf/node validation checks when tree block is read
+ * from disk, and check *every* possible member, so other code won't
+ * need to check them again.
+ *
+ * A wrong check can cause unwanted damage, so every checker needs to be
+ * carefully reviewed so that it does not prevent the mount of valid images.
+ */
+
+#include "ctree.h"
+#include "tree-checker.h"
+#include "disk-io.h"
+#include "compression.h"
+#include "hash.h"
+#include "volumes.h"
+
+#define CORRUPT(reason, eb, root, slot) \
+ btrfs_crit(root->fs_info, \
+ "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \
+ btrfs_header_level(eb) == 0 ? "leaf" : "node", \
+ reason, btrfs_header_bytenr(eb), root->objectid, slot)
+
+/*
+ * Error message should follow the following format:
+ * corrupt <type>: <identifier>, <reason>[, <bad_value>]
+ *
+ * @type: leaf or node
+ * @identifier: the necessary info to locate the leaf/node.
+ * It's recommended to decode key.objectid/offset if it's
+ * meaningful.
+ * @reason: describe the error
+ * @bad_value: optional, it's recommended to output bad value and its
+ * expected value (range).
+ *
+ * Since comma is used to separate the components, only space is allowed
+ * inside each component.
+ */
+
+/*
+ * Prepend generic "corrupt leaf/node: root=%llu block=%llu slot=%d, " to @fmt.
+ * Allows callers to customize the output.
+ */
+__printf(4, 5)
+static void generic_err(const struct btrfs_root *root,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(root->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root->objectid, btrfs_header_bytenr(eb), slot, &vaf);
+ va_end(args);
+}
+
+static int check_extent_data_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_file_extent_item *fi;
+ u32 sectorsize = root->sectorsize;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+
+ if (!IS_ALIGNED(key->offset, sectorsize)) {
+ CORRUPT("unaligned key offset for file extent",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
+ CORRUPT("invalid file extent type", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Support for new compression/encryption must introduce incompat flag,
+ * and must be caught in open_ctree().
+ */
+ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
+ CORRUPT("invalid file extent compression", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_encryption(leaf, fi)) {
+ CORRUPT("invalid file extent encryption", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ /* Inline extent must have 0 as key offset */
+ if (key->offset) {
+ CORRUPT("inline extent has non-zero key offset",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /* Compressed inline extent has no on-disk size, skip it */
+ if (btrfs_file_extent_compression(leaf, fi) !=
+ BTRFS_COMPRESS_NONE)
+ return 0;
+
+ /* Uncompressed inline extent size must match item size */
+ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi)) {
+ CORRUPT("plaintext inline extent has invalid size",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /* Regular or preallocated extent has fixed item size */
+ if (item_size != sizeof(*fi)) {
+ CORRUPT(
+ "regluar or preallocated extent data item size is invalid",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) ||
+ !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) ||
+ !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) ||
+ !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) ||
+ !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) {
+ CORRUPT(
+ "regular or preallocated extent data item has unaligned value",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ u32 sectorsize = root->sectorsize;
+ u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy);
+
+ if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
+ CORRUPT("invalid objectid for csum item", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(key->offset, sectorsize)) {
+ CORRUPT("unaligned key offset for csum item", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
+ CORRUPT("unaligned csum item size", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+/*
+ * Customized reporter for dir_item, only important new info is key->objectid,
+ * which represents inode number
+ */
+__printf(4, 5)
+static void dir_item_err(const struct btrfs_root *root,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(root->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
+ btrfs_header_bytenr(eb), slot, key.objectid, &vaf);
+ va_end(args);
+}
+
+static int check_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_dir_item *di;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 cur = 0;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ while (cur < item_size) {
+ u32 name_len;
+ u32 data_len;
+ u32 max_name_len;
+ u32 total_size;
+ u32 name_hash;
+ u8 dir_type;
+
+ /* header itself should not cross item boundary */
+ if (cur + sizeof(*di) > item_size) {
+ dir_item_err(root, leaf, slot,
+ "dir item header crosses item boundary, have %zu boundary %u",
+ cur + sizeof(*di), item_size);
+ return -EUCLEAN;
+ }
+
+ /* dir type check */
+ dir_type = btrfs_dir_type(leaf, di);
+ if (dir_type >= BTRFS_FT_MAX) {
+ dir_item_err(root, leaf, slot,
+ "invalid dir item type, have %u expect [0, %u)",
+ dir_type, BTRFS_FT_MAX);
+ return -EUCLEAN;
+ }
+
+ if (key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(root, leaf, slot,
+ "invalid dir item type for XATTR key, have %u expect %u",
+ dir_type, BTRFS_FT_XATTR);
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY) {
+ dir_item_err(root, leaf, slot,
+ "xattr dir type found for non-XATTR key");
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR)
+ max_name_len = XATTR_NAME_MAX;
+ else
+ max_name_len = BTRFS_NAME_LEN;
+
+ /* Name/data length check */
+ name_len = btrfs_dir_name_len(leaf, di);
+ data_len = btrfs_dir_data_len(leaf, di);
+ if (name_len > max_name_len) {
+ dir_item_err(root, leaf, slot,
+ "dir item name len too long, have %u max %u",
+ name_len, max_name_len);
+ return -EUCLEAN;
+ }
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
+ dir_item_err(root, leaf, slot,
+ "dir item name and data len too long, have %u max %zu",
+ name_len + data_len,
+ BTRFS_MAX_XATTR_SIZE(root));
+ return -EUCLEAN;
+ }
+
+ if (data_len && dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(root, leaf, slot,
+ "dir item with invalid data len, have %u expect 0",
+ data_len);
+ return -EUCLEAN;
+ }
+
+ total_size = sizeof(*di) + name_len + data_len;
+
+ /* header and name/data should not cross item boundary */
+ if (cur + total_size > item_size) {
+ dir_item_err(root, leaf, slot,
+ "dir item data crosses item boundary, have %u boundary %u",
+ cur + total_size, item_size);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Special check for XATTR/DIR_ITEM, as key->offset is name
+ * hash, should match its name
+ */
+ if (key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_XATTR_ITEM_KEY) {
+ char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+
+ read_extent_buffer(leaf, namebuf,
+ (unsigned long)(di + 1), name_len);
+ name_hash = btrfs_name_hash(namebuf, name_len);
+ if (key->offset != name_hash) {
+ dir_item_err(root, leaf, slot,
+ "name hash mismatch with key, have 0x%016x expect 0x%016llx",
+ name_hash, key->offset);
+ return -EUCLEAN;
+ }
+ }
+ cur += total_size;
+ di = (struct btrfs_dir_item *)((void *)di + total_size);
+ }
+ return 0;
+}
+/*
+ * Report a problem found in a block group item through btrfs_crit().
+ *
+ * The message is prefixed with whether @eb is a leaf or a node (decided by
+ * its header level), the owner and bytenr from its header, @slot, and the
+ * objectid/offset of the key stored at @slot (printed as bg_start/bg_len).
+ * The caller's printf-style @fmt and its arguments are appended via %pV.
+ */
+__printf(4, 5)
+__cold
+static void block_group_err(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot); /* key supplies bg_start/bg_len below */
+ va_start(args, fmt);
+
+ vaf.fmt = fmt; /* hand the caller's format and arguments to %pV */
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, key.offset, &vaf);
+ va_end(args);
+}
+
+/*
+ * Sanity-check the BLOCK_GROUP_ITEM stored at @slot of @leaf.
+ *
+ * @key is the item's key; key->offset is treated as the block group length.
+ * The checks, in order: length within (0, BTRFS_MAX_DATA_CHUNK_SIZE], item
+ * size equal to sizeof(struct btrfs_block_group_item), chunk objectid equal
+ * to BTRFS_FIRST_CHUNK_TREE_OBJECTID, used bytes not above the length, at
+ * most one profile bit set, and a type of DATA, METADATA, SYSTEM or
+ * METADATA|DATA.
+ *
+ * Returns 0 when every check passes, or -EUCLEAN after reporting the first
+ * failure through block_group_err().
+ */
+static int check_block_group_item(struct btrfs_fs_info *fs_info,
+				  struct extent_buffer *leaf,
+				  struct btrfs_key *key, int slot)
+{
+	struct btrfs_block_group_item bgi;
+	u32 item_size = btrfs_item_size_nr(leaf, slot);
+	u64 flags;
+	u64 type;
+
+	/*
+	 * Here we don't really care about alignment since extent allocator can
+	 * handle it. We care more about the size: if one block group is
+	 * larger than the maximum size, it must be some obvious corruption.
+	 */
+	if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) {
+		block_group_err(fs_info, leaf, slot,
+			"invalid block group size, have %llu expect (0, %llu]",
+				key->offset, BTRFS_MAX_DATA_CHUNK_SIZE);
+		return -EUCLEAN;
+	}
+
+	/* The size must match before the item is copied into the local bgi. */
+	if (item_size != sizeof(bgi)) {
+		block_group_err(fs_info, leaf, slot,
+			"invalid item size, have %u expect %zu",
+				item_size, sizeof(bgi));
+		return -EUCLEAN;
+	}
+
+	read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+			   sizeof(bgi));
+	if (btrfs_block_group_chunk_objectid(&bgi) !=
+	    BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
+		block_group_err(fs_info, leaf, slot,
+		"invalid block group chunk objectid, have %llu expect %llu",
+				btrfs_block_group_chunk_objectid(&bgi),
+				BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+		return -EUCLEAN;
+	}
+
+	/*
+	 * used == key->offset is accepted by this check, so the valid range
+	 * reported in the message is inclusive at both ends.
+	 */
+	if (btrfs_block_group_used(&bgi) > key->offset) {
+		block_group_err(fs_info, leaf, slot,
+			"invalid block group used, have %llu expect [0, %llu]",
+				btrfs_block_group_used(&bgi), key->offset);
+		return -EUCLEAN;
+	}
+
+	flags = btrfs_block_group_flags(&bgi);
+	if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
+		block_group_err(fs_info, leaf, slot,
+"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
+			flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
+			hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
+		return -EUCLEAN;
+	}
+
+	/* Only the four type combinations listed below are accepted. */
+	type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+	if (type != BTRFS_BLOCK_GROUP_DATA &&
+	    type != BTRFS_BLOCK_GROUP_METADATA &&
+	    type != BTRFS_BLOCK_GROUP_SYSTEM &&
+	    type != (BTRFS_BLOCK_GROUP_METADATA |
+			   BTRFS_BLOCK_GROUP_DATA)) {
+		block_group_err(fs_info, leaf, slot,
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+			type, hweight64(type),
+			BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
+			BTRFS_BLOCK_GROUP_SYSTEM,
+			BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA);
+		return -EUCLEAN;
+	}
+	return 0;
+}
+
+/*
+ * Common point to switch the item-specific validation.
+ *
+ * Dispatches on key->type to the matching checker for the item at @slot of
+ * @leaf and returns that checker's result. Key types without a case below
+ * are not inspected and yield 0.
+ */
+static int check_leaf_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ int ret = 0; /* stays 0 for key types that have no checker */
+
+ switch (key->type) {
+ case BTRFS_EXTENT_DATA_KEY:
+ ret = check_extent_data_item(root, leaf, key, slot);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ ret = check_csum_item(root, leaf, key, slot);
+ break;
+ case BTRFS_DIR_ITEM_KEY: /* all three key types share check_dir_item() */
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ ret = check_dir_item(root, leaf, key, slot);
+ break;
+ case BTRFS_BLOCK_GROUP_ITEM_KEY:
+ ret = check_block_group_item(root->fs_info, leaf, key, slot); /* takes fs_info, not root */
+ break;
+ }
+ return ret;
+}
+/*
+ * Validate the structure of a leaf.
+ *
+ * Checks that the header level is 0, applies the empty-leaf rules when the
+ * leaf has no items, and otherwise walks every slot verifying key order and
+ * the item offset/size layout. When @check_item_data is true, each item is
+ * additionally passed to check_leaf_item() for type-specific checks.
+ *
+ * Returns 0 if no problem is found, -EUCLEAN when one of the checks done
+ * here fails, or the negative value returned by check_leaf_item().
+ */
+static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
+ bool check_item_data)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ /* No valid key type is 0, so all keys should be larger than this key */
+ struct btrfs_key prev_key = {0, 0, 0};
+ struct btrfs_key key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ if (btrfs_header_level(leaf) != 0) {
+ generic_err(root, leaf, 0,
+ "invalid level for leaf, have %d expect 0",
+ btrfs_header_level(leaf));
+ return -EUCLEAN;
+ }
+
+ /*
+ * Extent buffers from a relocation tree have an owner field that
+ * corresponds to the subvolume tree they are based on. So just from an
+ * extent buffer alone we can not find out what is the id of the
+ * corresponding subvolume tree, so we can not figure out if the extent
+ * buffer corresponds to the root of the relocation tree or not. So
+ * skip this check for relocation trees.
+ */
+ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
+ u64 owner = btrfs_header_owner(leaf);
+ struct btrfs_root *check_root;
+
+ /* These trees must never be empty */
+ if (owner == BTRFS_ROOT_TREE_OBJECTID ||
+ owner == BTRFS_CHUNK_TREE_OBJECTID ||
+ owner == BTRFS_EXTENT_TREE_OBJECTID ||
+ owner == BTRFS_DEV_TREE_OBJECTID ||
+ owner == BTRFS_FS_TREE_OBJECTID ||
+ owner == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ generic_err(root, leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+ key.objectid = owner; /* build the lookup key for the owning root */
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ check_root = btrfs_get_fs_root(fs_info, &key, false);
+ /*
+ * The only reason we also check NULL here is that during
+ * open_ctree() some roots have not yet been set up.
+ */
+ if (!IS_ERR_OR_NULL(check_root)) {
+ struct extent_buffer *eb;
+
+ eb = btrfs_root_node(check_root);
+ /* if leaf is the root, then it's fine */
+ if (leaf != eb) {
+ CORRUPT("non-root leaf's nritems is 0",
+ leaf, check_root, 0);
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ free_extent_buffer(eb);
+ }
+ return 0;
+ }
+
+ if (nritems == 0) /* only reached by an empty leaf with the RELOC flag set */
+ return 0;
+
+ /*
+ * Check the following things to make sure this is a good leaf, and
+ * leaf users won't need to bother with similar sanity checks:
+ *
+ * 1) key ordering
+ * 2) item offset and size
+ * No overlap, no hole, all inside the leaf.
+ * 3) item content
+ * If possible, do comprehensive sanity check.
+ * NOTE: All checks must only rely on the item data itself.
+ */
+ for (slot = 0; slot < nritems; slot++) {
+ u32 item_end_expected;
+ int ret;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
+ CORRUPT("bad key order", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (slot == 0)
+ item_end_expected = BTRFS_LEAF_DATA_SIZE(root);
+ else
+ item_end_expected = btrfs_item_offset_nr(leaf,
+ slot - 1);
+ if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
+ CORRUPT("slot offset bad", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just in case all the items are consistent to each other, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("slot end outside of leaf", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /* Also check if the item pointer overlaps with btrfs item. */
+ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
+ btrfs_item_ptr_offset(leaf, slot)) {
+ CORRUPT("slot overlap with its data", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ if (check_item_data) {
+ /*
+ * Check if the item size and content meet other
+ * criteria
+ */
+ ret = check_leaf_item(root, leaf, &key, slot);
+ if (ret < 0)
+ return ret;
+ }
+
+ prev_key.objectid = key.objectid; /* remembered for the next slot's order check */
+ prev_key.type = key.type;
+ prev_key.offset = key.offset;
+ }
+
+ return 0;
+}
+/*
+ * Run check_leaf() on @leaf with item content checking enabled
+ * (check_item_data == true) and return its result.
+ */
+int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf)
+{
+ return check_leaf(root, leaf, true);
+}
+/*
+ * Run check_leaf() on @leaf with item content checking disabled
+ * (check_item_data == false), so only the level, empty-leaf, key order and
+ * item layout checks are done. Returns check_leaf()'s result.
+ */
+int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ return check_leaf(root, leaf, false);
+}
+/*
+ * Validate the structure of an internal node.
+ *
+ * Checks that the header level is within [1, BTRFS_MAX_LEVEL - 1] and that
+ * nritems is within [1, BTRFS_NODEPTRS_PER_BLOCK(root)]. Then, for every
+ * slot except the last one, checks that the block pointer is non-zero and
+ * aligned to root->sectorsize, and that the slot's key sorts strictly
+ * before the next slot's key.
+ *
+ * Returns 0 if no problem is found, -EUCLEAN otherwise.
+ */
+int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
+{
+ unsigned long nr = btrfs_header_nritems(node);
+ struct btrfs_key key, next_key;
+ int slot;
+ int level = btrfs_header_level(node);
+ u64 bytenr;
+ int ret = 0;
+
+ if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ generic_err(root, node, 0,
+ "invalid level for node, have %d expect [1, %d]",
+ level, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+ btrfs_crit(root->fs_info,
+"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%zu]",
+ root->objectid, node->start,
+ nr == 0 ? "small" : "large", nr,
+ BTRFS_NODEPTRS_PER_BLOCK(root));
+ return -EUCLEAN;
+ }
+ /*
+ * nr >= 1 is guaranteed by the check above, so nr - 1 cannot wrap.
+ * NOTE(review): the loop stops before the last slot, so the last
+ * slot's block pointer is never checked for NULL or alignment, and a
+ * node with a single item gets no per-slot checks at all.
+ */
+ for (slot = 0; slot < nr - 1; slot++) {
+ bytenr = btrfs_node_blockptr(node, slot);
+ btrfs_node_key_to_cpu(node, &key, slot);
+ btrfs_node_key_to_cpu(node, &next_key, slot + 1);
+
+ if (!bytenr) {
+ generic_err(root, node, slot,
+ "invalid NULL node pointer");
+ ret = -EUCLEAN;
+ goto out;
+ }
+ if (!IS_ALIGNED(bytenr, root->sectorsize)) {
+ generic_err(root, node, slot,
+ "unaligned pointer, have %llu should be aligned to %u",
+ bytenr, root->sectorsize);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { /* equal keys are rejected too */
+ generic_err(root, node, slot,
+ "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
+ key.objectid, key.type, key.offset,
+ next_key.objectid, next_key.type,
+ next_key.offset);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
new file mode 100644
index 000000000000..3d53e8d6fda0
--- /dev/null
+++ b/fs/btrfs/tree-checker.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+#ifndef __BTRFS_TREE_CHECKER__
+#define __BTRFS_TREE_CHECKER__
+
+#include "ctree.h"
+#include "extent_io.h"
+
+/*
+ * Comprehensive leaf checker.
+ * Will check not only the item pointers, but also every possible member
+ * in item data.
+ */
+int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf);
+
+/*
+ * Less strict leaf checker.
+ * Will only check item pointers, not reading item data.
+ */
+int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+ struct extent_buffer *leaf);
+int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node);
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b4d63a9842fa..5e8fe8f3942d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1184,7 +1184,7 @@ again:
struct map_lookup *map;
int i;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
u64 end;
@@ -2757,7 +2757,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
free_extent_map(em);
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
lock_chunks(root->fs_info->chunk_root);
check_system_chunk(trans, extent_root, map->type);
unlock_chunks(root->fs_info->chunk_root);
@@ -4540,7 +4540,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = 1024 * 1024 * 1024;
- max_chunk_size = 10 * max_stripe_size;
+ max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
@@ -4731,7 +4731,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
goto error;
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->bdev = (struct block_device *)map;
+ em->map_lookup = map;
em->start = start;
em->len = num_bytes;
em->block_start = 0;
@@ -4826,7 +4826,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
stripe_size = em->orig_block_len;
@@ -4968,7 +4968,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
if (!em)
return 1;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->missing) {
miss_ndevs++;
@@ -5048,7 +5048,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
@@ -5091,7 +5091,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
free_extent_map(em);
@@ -5112,7 +5112,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
@@ -5271,7 +5271,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
offset = logical - em->start;
stripe_len = map->stripe_len;
@@ -5813,7 +5813,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
free_extent_map(em);
return -EIO;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
length = em->len;
rmap_len = map->stripe_len;
@@ -6208,6 +6208,101 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
return dev;
}
+/*
+ * Sanity-check the fields of a chunk item.
+ *
+ * @leaf and @chunk locate the chunk item; @logical is the chunk's logical
+ * start. The checks cover: non-zero num_stripes, sectorsize alignment of
+ * @logical, matching sector size, non-zero aligned length, power-of-two
+ * stripe length, no bits outside the type/profile masks, at least one type
+ * bit, SYSTEM not combined with DATA or METADATA, DATA|METADATA only with
+ * the MIXED_GROUPS incompat feature, and per-profile stripe counts.
+ * Every failed check is logged with btrfs_err().
+ *
+ * Return -EIO if any error, otherwise return 0.
+ */
+static int btrfs_check_chunk_valid(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 logical)
+{
+ u64 length;
+ u64 stripe_len;
+ u16 num_stripes;
+ u16 sub_stripes;
+ u64 type;
+ u64 features;
+ bool mixed = false; /* set when the superblock has the MIXED_GROUPS incompat flag */
+
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+
+ if (!num_stripes) {
+ btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+ num_stripes);
+ return -EIO;
+ }
+ if (!IS_ALIGNED(logical, root->sectorsize)) {
+ btrfs_err(root->fs_info,
+ "invalid chunk logical %llu", logical);
+ return -EIO;
+ }
+ if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) {
+ btrfs_err(root->fs_info, "invalid chunk sectorsize %u",
+ btrfs_chunk_sector_size(leaf, chunk));
+ return -EIO;
+ }
+ if (!length || !IS_ALIGNED(length, root->sectorsize)) {
+ btrfs_err(root->fs_info,
+ "invalid chunk length %llu", length);
+ return -EIO;
+ }
+ if (!is_power_of_2(stripe_len)) {
+ btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+ stripe_len);
+ return -EIO;
+ }
+ if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ type) { /* any bit outside the two known masks is rejected */
+ btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+ ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ btrfs_chunk_type(leaf, chunk)); /* re-reads the value already held in 'type' */
+ return -EIO;
+ }
+
+ if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+ btrfs_err(root->fs_info, "missing chunk type flag: 0x%llx", type);
+ return -EIO;
+ }
+
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+ btrfs_err(root->fs_info,
+ "system chunk with data or metadata type: 0x%llx", type);
+ return -EIO;
+ }
+
+ features = btrfs_super_incompat_flags(root->fs_info->super_copy);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = true;
+
+ if (!mixed) {
+ if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
+ (type & BTRFS_BLOCK_GROUP_DATA)) {
+ btrfs_err(root->fs_info,
+ "mixed chunk type in non-mixed mode: 0x%llx", type);
+ return -EIO;
+ }
+ }
+ /*
+ * Per-profile stripe count limits.
+ * NOTE(review): the RAID1 clause (num_stripes < 1) cannot trigger here,
+ * because num_stripes == 0 was already rejected at the top of this
+ * function.
+ */
+ if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+ num_stripes != 1)) {
+ btrfs_err(root->fs_info,
+ "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+ num_stripes, sub_stripes,
+ type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+ return -EIO;
+ }
+
+ return 0;
+}
+
static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
@@ -6217,6 +6312,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_map *em;
u64 logical;
u64 length;
+ u64 stripe_len;
u64 devid;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
@@ -6225,6 +6321,12 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ ret = btrfs_check_chunk_valid(root, leaf, chunk, logical);
+ if (ret)
+ return ret;
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
@@ -6241,7 +6343,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
em = alloc_extent_map();
if (!em)
return -ENOMEM;
- num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
free_extent_map(em);
@@ -6249,7 +6350,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->bdev = (struct block_device *)map;
+ em->map_lookup = map;
em->start = logical;
em->len = length;
em->orig_start = 0;
@@ -6473,6 +6574,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
u32 array_size;
u32 len = 0;
u32 cur_offset;
+ u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
@@ -6539,6 +6641,15 @@ int btrfs_read_sys_array(struct btrfs_root *root)
break;
}
+ type = btrfs_chunk_type(sb, chunk);
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
+ btrfs_err(root->fs_info,
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
@@ -6948,7 +7059,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
/* In order to kick the device replace finish process */
lock_chunks(root);
list_for_each_entry(em, &transaction->pending_chunks, list) {
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
dev = map->stripes[i].dev;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d5c84f6b1353..3c651df420be 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
#include <linux/btrfs.h>
#include "async-thread.h"
+#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) /* upper bound for a data chunk / block group length */
+
extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN (64 * 1024)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 79a1bad88931..7b00727e1c11 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1073,10 +1073,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf) {
+ if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) {
free_xid(xid);
return -EINVAL;
}
@@ -1404,10 +1404,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf)
+ if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE)))
return -EINVAL;
max_num = (max_buf - sizeof(struct smb_hdr)) /
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index b2aff0c6f22c..b7885dc0d9bb 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -123,10 +123,10 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf)
+ if (max_buf < sizeof(struct smb2_lock_element))
return -EINVAL;
max_num = max_buf / sizeof(struct smb2_lock_element);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 54af10204e83..1cf0a336ec06 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -360,7 +360,7 @@ uncork:
if (rc < 0 && rc != -EINTR)
cifs_dbg(VFS, "Error %d sending data on socket to server\n",
rc);
- else
+ else if (rc > 0)
rc = 0;
return rc;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 52b3484f5703..ec506c2733ee 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1873,12 +1873,12 @@ int ext4_inline_data_fiemap(struct inode *inode,
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, start, physical,
- inline_len, flags);
brelse(iloc.bh);
out:
up_read(&EXT4_I(inode)->xattr_sem);
+ if (physical)
+ error = fiemap_fill_next_extent(fieinfo, start, physical,
+ inline_len, flags);
return (error < 0 ? error : 0);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ae3691bd279b..84b3ee71d175 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -142,6 +142,8 @@ static bool f2fs_bio_post_read_required(struct bio *bio)
static void f2fs_read_end_io(struct bio *bio)
{
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_READ_IO)) {
f2fs_show_injection_info(FAULT_READ_IO);
bio->bi_error = -EIO;
@@ -155,6 +157,13 @@ static void f2fs_read_end_io(struct bio *bio)
return;
}
+ if (first_page != NULL &&
+ __read_io_type(first_page) == F2FS_RD_DATA) {
+ trace_android_fs_dataread_end(first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size);
+ }
+
__read_end_io(bio);
}
@@ -321,6 +330,32 @@ submit_io:
submit_bio(bio_op(bio), bio);
}
+/*
+ * Submit a read bio through __submit_bio(), first emitting the android_fs
+ * dataread_start trace event when that tracepoint is enabled, @type is DATA
+ * and the bio's first page is non-NULL with read I/O type F2FS_RD_DATA.
+ */
+static void __f2fs_submit_read_bio(struct f2fs_sb_info *sbi,
+ struct bio *bio, enum page_type type)
+{
+ if (trace_android_fs_dataread_start_enabled() && (type == DATA)) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page; /* only the bio's first page is inspected */
+
+ if (first_page != NULL &&
+ __read_io_type(first_page) == F2FS_RD_DATA) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; /* on-stack buffer handed to the pathname helper */
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ first_page->mapping->host);
+
+ trace_android_fs_dataread_start(
+ first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size,
+ current->pid,
+ path,
+ current->comm);
+ }
+ }
+ __submit_bio(sbi, bio, type); /* always submitted, traced or not */
+}
+
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -468,7 +503,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page): WB_DATA_TYPE(fio->page));
- __submit_bio(fio->sbi, bio, fio->type);
+ __f2fs_submit_read_bio(fio->sbi, bio, fio->type);
return 0;
}
@@ -598,7 +633,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
}
ClearPageError(page);
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return 0;
}
@@ -1597,7 +1632,7 @@ got_it:
if (bio && (last_block_in_bio != block_nr - 1 ||
!__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
submit_and_realloc:
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
if (bio == NULL) {
@@ -1629,7 +1664,7 @@ set_error_page:
goto next_page;
confused:
if (bio) {
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
unlock_page(page);
@@ -1639,7 +1674,7 @@ next_page:
}
BUG_ON(pages && !list_empty(pages));
if (bio)
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return 0;
}
diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h
index 519b6e2d769e..661e5c2a8e2a 100644
--- a/include/linux/hashtable.h
+++ b/include/linux/hashtable.h
@@ -16,6 +16,10 @@
struct hlist_head name[1 << (bits)] = \
{ [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }
+/*
+ * Define @name as an array of (1 << @bits) hlist_head entries marked
+ * __read_mostly, with every entry initialized to HLIST_HEAD_INIT.
+ */
+#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \
+ struct hlist_head name[1 << (bits)] __read_mostly = \
+ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }
+
#define DECLARE_HASHTABLE(name, bits) \
struct hlist_head name[1 << (bits)]
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index cc0fc712bb82..a8ac3f25b4ec 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -290,9 +290,12 @@ struct svc_rqst {
struct svc_cacherep * rq_cacherep; /* cache info */
struct task_struct *rq_task; /* service thread */
spinlock_t rq_lock; /* per-request lock */
+ struct net *rq_bc_net; /* pointer to backchannel's
+ * net namespace
+ */
};
-#define SVC_NET(svc_rqst) (svc_rqst->rq_xprt->xpt_net)
+#define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
/*
* Rigorous type checking on sockaddr type conversions
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index e5ce8ab0b8b0..26c155bb639b 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -70,7 +70,7 @@ struct virtio_config_ops {
int (*find_vqs)(struct virtio_device *, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
- const char *names[]);
+ const char * const names[]);
void (*del_vqs)(struct virtio_device *);
u64 (*get_features)(struct virtio_device *vdev);
int (*finalize_features)(struct virtio_device *vdev);
@@ -149,6 +149,19 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
return __virtio_test_bit(vdev, fbit);
}
+/**
+ * virtio_has_iommu_quirk - determine whether this device has the iommu quirk
+ * @vdev: the device
+ *
+ * Return: true when VIRTIO_F_IOMMU_PLATFORM is NOT set on @vdev, false when
+ * it is set.
+ */
+static inline bool virtio_has_iommu_quirk(const struct virtio_device *vdev)
+{
+ /*
+ * Note the reverse polarity of the quirk feature (compared to most
+ * other features), this is for compatibility with legacy systems.
+ */
+ return !virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
+}
+
static inline
struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
vq_callback_t *c, const char *n)
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
new file mode 100644
index 000000000000..584f9a647ad4
--- /dev/null
+++ b/include/linux/virtio_vsock.h
@@ -0,0 +1,157 @@
+#ifndef _LINUX_VIRTIO_VSOCK_H
+#define _LINUX_VIRTIO_VSOCK_H
+
+#include <uapi/linux/virtio_vsock.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <net/af_vsock.h>
+/*
+ * Buffer size constants for the virtio vsock transport.
+ * NOTE(review): units are presumably bytes (128 B, 256 KiB, 4 KiB, 64 KiB)
+ * -- confirm against the users of these macros.
+ */
+#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128
+#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256)
+#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256)
+#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4)
+#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64)
+/*
+ * Presumably the indices of the transport's virtqueues; VSOCK_VQ_MAX is
+ * one past the last index -- TODO confirm against the users of this enum.
+ */
+enum {
+ VSOCK_VQ_RX = 0, /* for host to guest data */
+ VSOCK_VQ_TX = 1, /* for guest to host data */
+ VSOCK_VQ_EVENT = 2,
+ VSOCK_VQ_MAX = 3,
+};
+
+/*
+ * Per-socket state (accessed via vsk->trans).
+ *
+ * Fields are grouped by the lock that protects them, as noted on each
+ * group below.
+ */
+struct virtio_vsock_sock {
+ struct vsock_sock *vsk; /* presumably the socket this state belongs to -- confirm */
+
+ /* Protected by lock_sock(sk_vsock(trans->vsk)) */
+ u32 buf_size;
+ u32 buf_size_min;
+ u32 buf_size_max;
+
+ spinlock_t tx_lock; /* guards the "Protected by tx_lock" group */
+ spinlock_t rx_lock; /* guards the "Protected by rx_lock" group */
+
+ /* Protected by tx_lock */
+ u32 tx_cnt;
+ u32 buf_alloc;
+ u32 peer_fwd_cnt;
+ u32 peer_buf_alloc;
+
+ /* Protected by rx_lock */
+ u32 fwd_cnt;
+ u32 rx_bytes;
+ struct list_head rx_queue;
+};
+/*
+ * One vsock packet: the virtio_vsock_hdr plus bookkeeping fields.
+ * NOTE(review): buf/len presumably describe the payload buffer and off a
+ * position within it -- confirm against the code that fills these in.
+ */
+struct virtio_vsock_pkt {
+ struct virtio_vsock_hdr hdr;
+ struct work_struct work;
+ struct list_head list;
+ /* socket refcnt not held, only use for cancellation */
+ struct vsock_sock *vsk;
+ void *buf;
+ u32 len;
+ u32 off;
+ bool reply;
+};
+/*
+ * Parameter bundle describing a packet.
+ * NOTE(review): presumably filled in by a sender and used to build a
+ * struct virtio_vsock_pkt -- confirm against the code that consumes it.
+ */
+struct virtio_vsock_pkt_info {
+ u32 remote_cid, remote_port;
+ struct vsock_sock *vsk;
+ struct msghdr *msg;
+ u32 pkt_len;
+ u16 type;
+ u16 op;
+ u32 flags;
+ bool reply;
+};
+/*
+ * A vsock_transport extended with a virtio-specific send_pkt operation.
+ * NOTE(review): the "first field" requirement presumably lets a
+ * struct vsock_transport pointer be converted to the containing
+ * struct virtio_transport -- confirm against the users.
+ */
+struct virtio_transport {
+ /* This must be the first field */
+ struct vsock_transport transport;
+
+ /* Takes ownership of the packet */
+ int (*send_pkt)(struct virtio_vsock_pkt *pkt);
+};
+
+ssize_t
+virtio_transport_stream_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len,
+ int type);
+int
+virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len, int flags);
+
+s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
+s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
+
+int virtio_transport_do_socket_init(struct vsock_sock *vsk,
+ struct vsock_sock *psk);
+u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk);
+u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk);
+u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk);
+/* Setters for the buffer size, minimum buffer size and maximum buffer size of @vsk. */
+void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val);
+void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val);
+void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val);
+int
+virtio_transport_notify_poll_in(struct vsock_sock *vsk,
+ size_t target,
+ bool *data_ready_now);
+int
+virtio_transport_notify_poll_out(struct vsock_sock *vsk,
+ size_t target,
+ bool *space_available_now);
+
+int virtio_transport_notify_recv_init(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data);
+int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data);
+int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data);
+int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk,
+ size_t target, ssize_t copied, bool data_read,
+ struct vsock_transport_recv_notify_data *data);
+int virtio_transport_notify_send_init(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data);
+int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data);
+int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data);
+int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk,
+ ssize_t written, struct vsock_transport_send_notify_data *data);
+
+u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk);
+bool virtio_transport_stream_is_active(struct vsock_sock *vsk);
+bool virtio_transport_stream_allow(u32 cid, u32 port);
+int virtio_transport_dgram_bind(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr);
+bool virtio_transport_dgram_allow(u32 cid, u32 port);
+
+int virtio_transport_connect(struct vsock_sock *vsk);
+
+int virtio_transport_shutdown(struct vsock_sock *vsk, int mode);
+
+void virtio_transport_release(struct vsock_sock *vsk);
+
+ssize_t
+virtio_transport_stream_enqueue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len);
+int
+virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
+ struct sockaddr_vm *remote_addr,
+ struct msghdr *msg,
+ size_t len);
+
+void virtio_transport_destruct(struct vsock_sock *vsk);
+
+void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt);
+void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt);
+void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt);
+u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
+void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
+
+#endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index f7a35fcaaaf6..f38fe1c00564 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -64,6 +64,8 @@ struct vsock_sock {
bool rejected;
struct delayed_work connect_work;
struct delayed_work pending_work;
+ struct delayed_work close_work;
+ bool close_work_scheduled;
u32 peer_shutdown;
bool sent_request;
bool ignore_connecting_rst;
@@ -98,6 +100,9 @@ struct vsock_transport {
void (*destruct)(struct vsock_sock *);
void (*release)(struct vsock_sock *);
+ /* Cancel all pending packets sent on vsock. */
+ int (*cancel_pkt)(struct vsock_sock *vsk);
+
/* Connections. */
int (*connect)(struct vsock_sock *);
@@ -165,6 +170,9 @@ static inline int vsock_core_init(const struct vsock_transport *t)
}
void vsock_core_exit(void);
+/* The transport may downcast this to access transport-specific functions */
+const struct vsock_transport *vsock_core_get_transport(void);
+
/**** UTILS ****/
void vsock_release_pending(struct sock *pending);
@@ -177,6 +185,7 @@ void vsock_remove_connected(struct vsock_sock *vsk);
struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
struct sockaddr_vm *dst);
+void vsock_remove_sock(struct vsock_sock *vsk);
void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
#endif /* __AF_VSOCK_H__ */
diff --git a/include/trace/events/vsock_virtio_transport_common.h b/include/trace/events/vsock_virtio_transport_common.h
new file mode 100644
index 000000000000..b7f1d6278280
--- /dev/null
+++ b/include/trace/events/vsock_virtio_transport_common.h
@@ -0,0 +1,144 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vsock
+
+#if !defined(_TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H) || \
+ defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H
+
+#include <linux/tracepoint.h>
+
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM);
+
+#define show_type(val) \
+ __print_symbolic(val, { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" })
+
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RESPONSE);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RST);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_SHUTDOWN);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RW);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_UPDATE);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_REQUEST);
+
+#define show_op(val) \
+ __print_symbolic(val, \
+ { VIRTIO_VSOCK_OP_INVALID, "INVALID" }, \
+ { VIRTIO_VSOCK_OP_REQUEST, "REQUEST" }, \
+ { VIRTIO_VSOCK_OP_RESPONSE, "RESPONSE" }, \
+ { VIRTIO_VSOCK_OP_RST, "RST" }, \
+ { VIRTIO_VSOCK_OP_SHUTDOWN, "SHUTDOWN" }, \
+ { VIRTIO_VSOCK_OP_RW, "RW" }, \
+ { VIRTIO_VSOCK_OP_CREDIT_UPDATE, "CREDIT_UPDATE" }, \
+ { VIRTIO_VSOCK_OP_CREDIT_REQUEST, "CREDIT_REQUEST" })
+
+TRACE_EVENT(virtio_transport_alloc_pkt,
+ TP_PROTO(
+ __u32 src_cid, __u32 src_port,
+ __u32 dst_cid, __u32 dst_port,
+ __u32 len,
+ __u16 type,
+ __u16 op,
+ __u32 flags
+ ),
+ TP_ARGS(
+ src_cid, src_port,
+ dst_cid, dst_port,
+ len,
+ type,
+ op,
+ flags
+ ),
+ TP_STRUCT__entry(
+ __field(__u32, src_cid)
+ __field(__u32, src_port)
+ __field(__u32, dst_cid)
+ __field(__u32, dst_port)
+ __field(__u32, len)
+ __field(__u16, type)
+ __field(__u16, op)
+ __field(__u32, flags)
+ ),
+ TP_fast_assign(
+ __entry->src_cid = src_cid;
+ __entry->src_port = src_port;
+ __entry->dst_cid = dst_cid;
+ __entry->dst_port = dst_port;
+ __entry->len = len;
+ __entry->type = type;
+ __entry->op = op;
+ __entry->flags = flags;
+ ),
+ TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x",
+ __entry->src_cid, __entry->src_port,
+ __entry->dst_cid, __entry->dst_port,
+ __entry->len,
+ show_type(__entry->type),
+ show_op(__entry->op),
+ __entry->flags)
+);
+
+TRACE_EVENT(virtio_transport_recv_pkt,
+ TP_PROTO(
+ __u32 src_cid, __u32 src_port,
+ __u32 dst_cid, __u32 dst_port,
+ __u32 len,
+ __u16 type,
+ __u16 op,
+ __u32 flags,
+ __u32 buf_alloc,
+ __u32 fwd_cnt
+ ),
+ TP_ARGS(
+ src_cid, src_port,
+ dst_cid, dst_port,
+ len,
+ type,
+ op,
+ flags,
+ buf_alloc,
+ fwd_cnt
+ ),
+ TP_STRUCT__entry(
+ __field(__u32, src_cid)
+ __field(__u32, src_port)
+ __field(__u32, dst_cid)
+ __field(__u32, dst_port)
+ __field(__u32, len)
+ __field(__u16, type)
+ __field(__u16, op)
+ __field(__u32, flags)
+ __field(__u32, buf_alloc)
+ __field(__u32, fwd_cnt)
+ ),
+ TP_fast_assign(
+ __entry->src_cid = src_cid;
+ __entry->src_port = src_port;
+ __entry->dst_cid = dst_cid;
+ __entry->dst_port = dst_port;
+ __entry->len = len;
+ __entry->type = type;
+ __entry->op = op;
+ __entry->flags = flags;
+ __entry->buf_alloc = buf_alloc;
+ __entry->fwd_cnt = fwd_cnt;
+ ),
+ TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x "
+ "buf_alloc=%u fwd_cnt=%u",
+ __entry->src_cid, __entry->src_port,
+ __entry->dst_cid, __entry->dst_port,
+ __entry->len,
+ show_type(__entry->type),
+ show_op(__entry->op),
+ __entry->flags,
+ __entry->buf_alloc,
+ __entry->fwd_cnt)
+);
+
+#endif /* _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H */
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE vsock_virtio_transport_common
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index c06237170542..01269fa43e77 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -508,6 +508,7 @@ header-y += virtio_ring.h
header-y += virtio_rng.h
header-y += virtio_scsi.h
header-y += virtio_types.h
+header-y += virtio_vsock.h
header-y += vm_sockets.h
header-y += vt.h
header-y += wait.h
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index ab3731917bac..56b7ab584cc0 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -47,6 +47,32 @@ struct vhost_vring_addr {
__u64 log_guest_addr;
};
+/* no alignment requirement */
+struct vhost_iotlb_msg {
+ __u64 iova;
+ __u64 size;
+ __u64 uaddr;
+#define VHOST_ACCESS_RO 0x1
+#define VHOST_ACCESS_WO 0x2
+#define VHOST_ACCESS_RW 0x3
+ __u8 perm;
+#define VHOST_IOTLB_MISS 1
+#define VHOST_IOTLB_UPDATE 2
+#define VHOST_IOTLB_INVALIDATE 3
+#define VHOST_IOTLB_ACCESS_FAIL 4
+ __u8 type;
+};
+
+#define VHOST_IOTLB_MSG 0x1
+
+struct vhost_msg {
+ int type;
+ union {
+ struct vhost_iotlb_msg iotlb;
+ __u8 padding[64];
+ };
+};
+
struct vhost_memory_region {
__u64 guest_phys_addr;
__u64 memory_size; /* bytes */
@@ -126,6 +152,12 @@ struct vhost_memory {
#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
/* Set eventfd to signal an error */
#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+/* Set busy loop timeout (in us) */
+#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \
+ struct vhost_vring_state)
+/* Get busy loop timeout (in us) */
+#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \
+ struct vhost_vring_state)
/* VHOST_NET specific defines */
@@ -140,6 +172,8 @@ struct vhost_memory {
#define VHOST_F_LOG_ALL 26
/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
#define VHOST_NET_F_VIRTIO_NET_HDR 27
+/* Vhost has device IOTLB */
+#define VHOST_F_DEVICE_IOTLB 63
/* VHOST_SCSI specific definitions */
@@ -169,4 +203,9 @@ struct vhost_scsi_target {
#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
+/* VHOST_VSOCK specific defines */
+
+#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64)
+#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int)
+
#endif
diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index c18264df9504..cf49c7e2cfdb 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -47,7 +47,7 @@
* transport being used (eg. virtio_ring), the rest are per-device feature
* bits. */
#define VIRTIO_TRANSPORT_F_START 28
-#define VIRTIO_TRANSPORT_F_END 33
+#define VIRTIO_TRANSPORT_F_END 34
#ifndef VIRTIO_CONFIG_NO_LEGACY
/* Do we get callbacks when the ring is completely used, even if we've
@@ -61,4 +61,12 @@
/* v1.0 compliant. */
#define VIRTIO_F_VERSION_1 32
+/*
+ * If clear - device has the IOMMU bypass quirk feature.
+ * If set - use platform tools to detect the IOMMU.
+ *
+ * Note the reverse polarity (compared to most other features),
+ * this is for compatibility with legacy systems.
+ */
+#define VIRTIO_F_IOMMU_PLATFORM 33
#endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 77925f587b15..3228d582234a 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -41,5 +41,6 @@
#define VIRTIO_ID_CAIF 12 /* Virtio caif */
#define VIRTIO_ID_GPU 16 /* virtio GPU */
#define VIRTIO_ID_INPUT 18 /* virtio input */
+#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */
#endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
new file mode 100644
index 000000000000..1d57ed3d84d2
--- /dev/null
+++ b/include/uapi/linux/virtio_vsock.h
@@ -0,0 +1,94 @@
+/*
+ * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
+ * anyone can use the definitions to implement compatible drivers/servers:
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (C) Red Hat, Inc., 2013-2015
+ * Copyright (C) Asias He <asias@redhat.com>, 2013
+ * Copyright (C) Stefan Hajnoczi <stefanha@redhat.com>, 2015
+ */
+
+#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H
+#define _UAPI_LINUX_VIRTIO_VSOCK_H
+
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+
+struct virtio_vsock_config {
+ __le64 guest_cid;
+} __attribute__((packed));
+
+enum virtio_vsock_event_id {
+ VIRTIO_VSOCK_EVENT_TRANSPORT_RESET = 0,
+};
+
+struct virtio_vsock_event {
+ __le32 id;
+} __attribute__((packed));
+
+struct virtio_vsock_hdr {
+ __le64 src_cid;
+ __le64 dst_cid;
+ __le32 src_port;
+ __le32 dst_port;
+ __le32 len;
+ __le16 type; /* enum virtio_vsock_type */
+ __le16 op; /* enum virtio_vsock_op */
+ __le32 flags;
+ __le32 buf_alloc;
+ __le32 fwd_cnt;
+} __attribute__((packed));
+
+enum virtio_vsock_type {
+ VIRTIO_VSOCK_TYPE_STREAM = 1,
+};
+
+enum virtio_vsock_op {
+ VIRTIO_VSOCK_OP_INVALID = 0,
+
+ /* Connect operations */
+ VIRTIO_VSOCK_OP_REQUEST = 1,
+ VIRTIO_VSOCK_OP_RESPONSE = 2,
+ VIRTIO_VSOCK_OP_RST = 3,
+ VIRTIO_VSOCK_OP_SHUTDOWN = 4,
+
+ /* To send payload */
+ VIRTIO_VSOCK_OP_RW = 5,
+
+ /* Tell the peer our credit info */
+ VIRTIO_VSOCK_OP_CREDIT_UPDATE = 6,
+ /* Request the peer to send the credit info to us */
+ VIRTIO_VSOCK_OP_CREDIT_REQUEST = 7,
+};
+
+/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */
+enum virtio_vsock_shutdown {
+ VIRTIO_VSOCK_SHUTDOWN_RCV = 1,
+ VIRTIO_VSOCK_SHUTDOWN_SEND = 2,
+};
+
+#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */
diff --git a/mm/slab.c b/mm/slab.c
index 80ca19a122f3..6180cba0df78 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -859,8 +859,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
struct alien_cache *alc = NULL;
alc = kmalloc_node(memsize, gfp, node);
- init_arraycache(&alc->ac, entries, batch);
- spin_lock_init(&alc->lock);
+ if (alc) {
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
return alc;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 4810e43501c6..01012cf847a0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -185,7 +185,7 @@ EXPORT_SYMBOL(dev_base_lock);
static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
-static DEFINE_HASHTABLE(napi_hash, 8);
+static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
static seqcount_t devnet_rename_seq;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c5b0cb4f4056..41f6e964fe91 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1062,6 +1062,8 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {}
#endif
+extern void svc_tcp_prep_reply_hdr(struct svc_rqst *);
+
/*
* Common routine for processing the RPC request.
*/
@@ -1091,7 +1093,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
clear_bit(RQ_DROPME, &rqstp->rq_flags);
/* Setup reply header */
- rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
+ if (rqstp->rq_prot == IPPROTO_TCP)
+ svc_tcp_prep_reply_hdr(rqstp);
svc_putu32(resv, rqstp->rq_xid);
@@ -1138,7 +1141,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
case SVC_DENIED:
goto err_bad_auth;
case SVC_CLOSE:
- if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+ if (rqstp->rq_xprt &&
+ test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
svc_close_xprt(rqstp->rq_xprt);
case SVC_DROP:
goto dropit;
@@ -1360,10 +1364,10 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
dprintk("svc: %s(%p)\n", __func__, req);
/* Build the svc_rqst used by the common processing routine */
- rqstp->rq_xprt = serv->sv_bc_xprt;
rqstp->rq_xid = req->rq_xid;
rqstp->rq_prot = req->rq_xprt->prot;
rqstp->rq_server = serv;
+ rqstp->rq_bc_net = req->rq_xprt->xprt_net;
rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 71f15da72f02..2b8e80c721db 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -454,10 +454,11 @@ out:
*/
void svc_reserve(struct svc_rqst *rqstp, int space)
{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+
space += rqstp->rq_res.head[0].iov_len;
- if (space < rqstp->rq_reserved) {
- struct svc_xprt *xprt = rqstp->rq_xprt;
+ if (xprt && space < rqstp->rq_reserved) {
atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
rqstp->rq_reserved = space;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 9701fcca002c..0a9fe033132c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1240,7 +1240,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
/*
* Setup response header. TCP has a 4B record length field.
*/
-static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
+void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
{
struct kvec *resv = &rqstp->rq_res.head[0];
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
index 14810abedc2e..8831e7c42167 100644
--- a/net/vmw_vsock/Kconfig
+++ b/net/vmw_vsock/Kconfig
@@ -26,3 +26,23 @@ config VMWARE_VMCI_VSOCKETS
To compile this driver as a module, choose M here: the module
will be called vmw_vsock_vmci_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS
+ tristate "virtio transport for Virtual Sockets"
+ depends on VSOCKETS && VIRTIO
+ select VIRTIO_VSOCKETS_COMMON
+ help
+ This module implements a virtio transport for Virtual Sockets.
+
+ Enable this transport if your Virtual Machine host supports Virtual
+ Sockets over virtio.
+
+ To compile this driver as a module, choose M here: the module will be
+ called vmw_vsock_virtio_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS_COMMON
+ tristate
+ help
+ This option is selected by any driver which needs to access
+ the virtio_vsock. The module will be called
+ vmw_vsock_virtio_transport_common.
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 2ce52d70f224..bc27c70e0e59 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -1,7 +1,13 @@
obj-$(CONFIG_VSOCKETS) += vsock.o
obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o
vsock-y += af_vsock.o vsock_addr.o
vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
vmci_transport_notify_qstate.o
+
+vmw_vsock_virtio_transport-y += virtio_transport.o
+
+vmw_vsock_virtio_transport_common-y += virtio_transport_common.o
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 7f1d166ce612..7566395e526d 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -61,6 +61,14 @@
* function will also cleanup rejected sockets, those that reach the connected
* state but leave it before they have been accepted.
*
+ * - Lock ordering for pending or accept queue sockets is:
+ *
+ * lock_sock(listener);
+ * lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
+ *
+ * Using explicit nested locking keeps lockdep happy since normally only one
+ * lock of a given class may be taken at a time.
+ *
* - Sockets created by user action will be cleaned up when the user process
* calls close(2), causing our release implementation to be called. Our release
* implementation will perform some cleanup then drop the last reference so our
@@ -336,6 +344,16 @@ static bool vsock_in_connected_table(struct vsock_sock *vsk)
return ret;
}
+/* Drop vsk from the bound and connected socket tables, if it is in them.
+ * Exported so that transports can call it.
+ */
+void vsock_remove_sock(struct vsock_sock *vsk)
+{
+ if (vsock_in_bound_table(vsk))
+ vsock_remove_bound(vsk);
+
+ if (vsock_in_connected_table(vsk))
+ vsock_remove_connected(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_sock);
+
void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
{
int i;
@@ -443,10 +461,12 @@ static void vsock_pending_work(struct work_struct *work)
cleanup = true;
lock_sock(listener);
- lock_sock(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
+
+ listener->sk_ack_backlog--;
} else if (!vsk->rejected) {
/* We are not on the pending list and accept() did not reject
* us, so we must have been accepted by our user process. We
@@ -457,8 +477,6 @@ static void vsock_pending_work(struct work_struct *work)
goto out;
}
- listener->sk_ack_backlog--;
-
/* We need to remove ourself from the global connected sockets list so
* incoming packets can't find this socket, and to reduce the reference
* count.
@@ -655,12 +673,6 @@ static void __vsock_release(struct sock *sk)
vsk = vsock_sk(sk);
pending = NULL; /* Compiler warning. */
- if (vsock_in_bound_table(vsk))
- vsock_remove_bound(vsk);
-
- if (vsock_in_connected_table(vsk))
- vsock_remove_connected(vsk);
-
transport->release(vsk);
lock_sock(sk);
@@ -1092,10 +1104,19 @@ static const struct proto_ops vsock_dgram_ops = {
.sendpage = sock_no_sendpage,
};
+/* Ask the transport to cancel packets pending on vsk.
+ * Returns -EOPNOTSUPP when the transport has no cancel_pkt callback,
+ * otherwise whatever the callback returns.
+ */
+static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+ if (transport->cancel_pkt)
+ return transport->cancel_pkt(vsk);
+
+ return -EOPNOTSUPP;
+}
+
static void vsock_connect_timeout(struct work_struct *work)
{
struct sock *sk;
struct vsock_sock *vsk;
+ int cancel = 0;
vsk = container_of(work, struct vsock_sock, connect_work.work);
sk = sk_vsock(vsk);
@@ -1106,8 +1127,11 @@ static void vsock_connect_timeout(struct work_struct *work)
sk->sk_state = SS_UNCONNECTED;
sk->sk_err = ETIMEDOUT;
sk->sk_error_report(sk);
+ cancel = 1;
}
release_sock(sk);
+ if (cancel)
+ vsock_transport_cancel_pkt(vsk);
sock_put(sk);
}
@@ -1212,11 +1236,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
err = sock_intr_errno(timeout);
sk->sk_state = SS_UNCONNECTED;
sock->state = SS_UNCONNECTED;
+ vsock_transport_cancel_pkt(vsk);
goto out_wait;
} else if (timeout == 0) {
err = -ETIMEDOUT;
sk->sk_state = SS_UNCONNECTED;
sock->state = SS_UNCONNECTED;
+ vsock_transport_cancel_pkt(vsk);
goto out_wait;
}
@@ -1293,7 +1319,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
if (connected) {
listener->sk_ack_backlog--;
- lock_sock(connected);
+ lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
vconnected = vsock_sk(connected);
/* If the listener socket has received an error, then we should
@@ -1983,7 +2009,16 @@ void vsock_core_exit(void)
}
EXPORT_SYMBOL_GPL(vsock_core_exit);
+/* Return the file-scope 'transport' pointer, without taking any lock. */
+const struct vsock_transport *vsock_core_get_transport(void)
+{
+ /* vsock_register_mutex not taken since only the transport uses this
+ * function and only while registered.
+ */
+ return transport;
+}
+EXPORT_SYMBOL_GPL(vsock_core_get_transport);
+
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
-MODULE_VERSION("1.0.1.0-k");
+MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
new file mode 100644
index 000000000000..936d7eee62d0
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport.c
@@ -0,0 +1,620 @@
+/*
+ * virtio transport for vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * Some of the code is taken from Gerd Hoffmann <kraxel@redhat.com>'s
+ * early virtio-vsock proof-of-concept bits.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+#include <net/sock.h>
+#include <linux/mutex.h>
+#include <net/af_vsock.h>
+
+static struct workqueue_struct *virtio_vsock_workqueue;
+static struct virtio_vsock *the_virtio_vsock;
+static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */
+
+/* Driver state for the virtio-vsock device. virtio_vsock_probe() accepts
+ * only one device per guest, tracked through the_virtio_vsock.
+ */
+struct virtio_vsock {
+ struct virtio_device *vdev;
+ struct virtqueue *vqs[VSOCK_VQ_MAX];
+
+ /* Virtqueue processing is deferred to a workqueue */
+ struct work_struct tx_work;
+ struct work_struct rx_work;
+ struct work_struct event_work;
+
+ /* The following fields are protected by tx_lock. vqs[VSOCK_VQ_TX]
+ * must be accessed with tx_lock held.
+ */
+ struct mutex tx_lock;
+
+ /* Packets waiting to be added to the TX virtqueue by send_pkt_work;
+ * send_pkt_list is guarded by send_pkt_list_lock.
+ */
+ struct work_struct send_pkt_work;
+ spinlock_t send_pkt_list_lock;
+ struct list_head send_pkt_list;
+
+ /* Reply packets queued for sending but not yet added to the TX vq */
+ atomic_t queued_replies;
+
+ /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
+ * must be accessed with rx_lock held.
+ */
+ struct mutex rx_lock;
+ int rx_buf_nr;
+ int rx_buf_max_nr;
+
+ /* The following fields are protected by event_lock.
+ * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held.
+ */
+ struct mutex event_lock;
+ struct virtio_vsock_event event_list[8];
+
+ /* Cached from the device config by virtio_vsock_update_guest_cid() */
+ u32 guest_cid;
+};
+
+/* Return the single device instance (the_virtio_vsock); no lock is taken
+ * here. virtio_transport_send_pkt() checks the result for NULL.
+ */
+static struct virtio_vsock *virtio_vsock_get(void)
+{
+ return the_virtio_vsock;
+}
+
+/* Return the guest's CID, or VMADDR_CID_ANY when no virtio-vsock device
+ * is currently bound (virtio_vsock_get() returns NULL).
+ */
+static u32 virtio_transport_get_local_cid(void)
+{
+ struct virtio_vsock *vsock = virtio_vsock_get();
+
+ /* virtio_transport_send_pkt() already copes with a missing device;
+ * do the same here instead of dereferencing a NULL pointer.
+ */
+ if (!vsock)
+ return VMADDR_CID_ANY;
+
+ return vsock->guest_cid;
+}
+
+/* Worker that moves packets from send_pkt_list onto the TX virtqueue.
+ * Runs with tx_lock held for the whole drain; stops when the list is empty
+ * or virtqueue_add_sgs() fails. Kicks the device if anything was added and
+ * re-queues rx_work when the reply backlog drops below the rx ring size.
+ */
+static void
+virtio_transport_send_pkt_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, send_pkt_work);
+ struct virtqueue *vq;
+ bool added = false;
+ bool restart_rx = false;
+
+ mutex_lock(&vsock->tx_lock);
+
+ vq = vsock->vqs[VSOCK_VQ_TX];
+
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ struct scatterlist hdr, buf, *sgs[2];
+ int ret, in_sg = 0, out_sg = 0;
+ bool reply;
+
+ /* Pop the oldest queued packet, if any */
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ if (list_empty(&vsock->send_pkt_list)) {
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ break;
+ }
+
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ /* Sample the flag before the packet is handed to the virtqueue */
+ reply = pkt->reply;
+
+ /* Header always goes out; payload only when the packet has one */
+ sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+ sgs[out_sg++] = &hdr;
+ if (pkt->buf) {
+ sg_init_one(&buf, pkt->buf, pkt->len);
+ sgs[out_sg++] = &buf;
+ }
+
+ ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL);
+ /* Usually this means that there is no more space available in
+ * the vq
+ */
+ if (ret < 0) {
+ /* Put it back at the head so it is retried first */
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ break;
+ }
+
+ if (reply) {
+ struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+ int val;
+
+ val = atomic_dec_return(&vsock->queued_replies);
+
+ /* Do we now have resources to resume rx processing? */
+ if (val + 1 == virtqueue_get_vring_size(rx_vq))
+ restart_rx = true;
+ }
+
+ added = true;
+ }
+
+ if (added)
+ virtqueue_kick(vq);
+
+ mutex_unlock(&vsock->tx_lock);
+
+ /* Scheduled only after tx_lock has been dropped */
+ if (restart_rx)
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+}
+
+/* Queue pkt for transmission and schedule send_pkt_work.
+ * Returns pkt->len on success. If no device is present the packet is
+ * freed and -ENODEV is returned.
+ */
+static int
+virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock *vsock;
+ /* Saved up front: pkt is not touched again once it is on the list */
+ int len = pkt->len;
+
+ vsock = virtio_vsock_get();
+ if (!vsock) {
+ virtio_transport_free_pkt(pkt);
+ return -ENODEV;
+ }
+
+ /* Counted here, uncounted by send_pkt_work once added to the TX vq */
+ if (pkt->reply)
+ atomic_inc(&vsock->queued_replies);
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add_tail(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+ return len;
+}
+
+/* Post receive buffers of VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE bytes to the RX
+ * virtqueue until it has no free descriptors or an allocation/add fails,
+ * then kick the device. virtio_transport_rx_work() calls this with rx_lock
+ * held.
+ */
+static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
+{
+ int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+ struct virtio_vsock_pkt *pkt;
+ struct scatterlist hdr, buf, *sgs[2];
+ struct virtqueue *vq;
+ int ret;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ do {
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ break;
+
+ pkt->buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!pkt->buf) {
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+
+ /* Buffer capacity; rx_work overwrites it with the payload length */
+ pkt->len = buf_len;
+
+ sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+ sgs[0] = &hdr;
+
+ sg_init_one(&buf, pkt->buf, buf_len);
+ sgs[1] = &buf;
+ /* Both entries are device-writable (0 out, 2 in) */
+ ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
+ if (ret) {
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+ vsock->rx_buf_nr++;
+ } while (vq->num_free);
+ /* Track the high-water mark used by rx_work's refill threshold */
+ if (vsock->rx_buf_nr > vsock->rx_buf_max_nr)
+ vsock->rx_buf_max_nr = vsock->rx_buf_nr;
+ virtqueue_kick(vq);
+}
+
+/* Worker run after a TX virtqueue callback: frees every packet the device
+ * has finished with and, if any were reclaimed, schedules send_pkt_work.
+ */
+static void virtio_transport_tx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, tx_work);
+ struct virtqueue *vq;
+ bool added = false;
+
+ vq = vsock->vqs[VSOCK_VQ_TX];
+ mutex_lock(&vsock->tx_lock);
+ do {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ virtqueue_disable_cb(vq);
+ while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) {
+ virtio_transport_free_pkt(pkt);
+ added = true;
+ }
+ /* Loop again if virtqueue_enable_cb() returns false */
+ } while (!virtqueue_enable_cb(vq));
+ mutex_unlock(&vsock->tx_lock);
+
+ if (added)
+ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+}
+
+/* True while fewer replies are queued than the rx vring has entries,
+ * i.e. there is still room for replies to rx packets.
+ */
+static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
+{
+ struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+ int pending;
+
+ /* paired with atomic_inc() and atomic_dec_return() */
+ smp_rmb();
+ pending = atomic_read(&vsock->queued_replies);
+
+ if (pending < virtqueue_get_vring_size(rx_vq))
+ return true;
+
+ return false;
+}
+
+/* Worker run after an RX virtqueue callback (or re-queued by
+ * send_pkt_work): hands received packets to virtio_transport_recv_pkt()
+ * under rx_lock, pausing when too many replies are queued, and refills
+ * the ring once fewer than half of the peak number of buffers remain.
+ */
+static void virtio_transport_rx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, rx_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ mutex_lock(&vsock->rx_lock);
+
+ do {
+ virtqueue_disable_cb(vq);
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ if (!virtio_transport_more_replies(vsock)) {
+ /* Stop rx until the device processes already
+ * pending replies. Leave rx virtqueue
+ * callbacks disabled.
+ */
+ goto out;
+ }
+
+ pkt = virtqueue_get_buf(vq, &len);
+ if (!pkt) {
+ break;
+ }
+
+ vsock->rx_buf_nr--;
+
+ /* Drop short/long packets */
+ if (unlikely(len < sizeof(pkt->hdr) ||
+ len > sizeof(pkt->hdr) + pkt->len)) {
+ virtio_transport_free_pkt(pkt);
+ continue;
+ }
+
+ /* len counts header + payload; keep only the payload size */
+ pkt->len = len - sizeof(pkt->hdr);
+ virtio_transport_recv_pkt(pkt);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+out:
+ if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+}
+
+/* Post one event buffer to the event virtqueue as a device-writable
+ * buffer and return the virtqueue_add_inbuf() result.
+ * event_lock must be held.
+ */
+static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock,
+ struct virtio_vsock_event *event)
+{
+ struct scatterlist ev_sg;
+
+ sg_init_one(&ev_sg, event, sizeof(*event));
+
+ return virtqueue_add_inbuf(vsock->vqs[VSOCK_VQ_EVENT], &ev_sg, 1,
+ event, GFP_KERNEL);
+}
+
+/* Post every entry of event_list to the event virtqueue, then kick it.
+ * event_lock must be held.
+ */
+static void virtio_vsock_event_fill(struct virtio_vsock *vsock)
+{
+ struct virtio_vsock_event *ev = vsock->event_list;
+ struct virtio_vsock_event *end = ev + ARRAY_SIZE(vsock->event_list);
+
+ for (; ev != end; ev++)
+ virtio_vsock_event_fill_one(vsock, ev);
+
+ virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+}
+
+/* Mark sk unconnected with ECONNRESET and report the error, under the
+ * socket lock. Passed to vsock_for_each_connected_socket() on a transport
+ * reset event.
+ */
+static void virtio_vsock_reset_sock(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = ECONNRESET;
+ sk->sk_error_report(sk);
+ release_sock(sk);
+}
+
+/* Re-read guest_cid from the device config space and cache it in vsock. */
+static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
+{
+ struct virtio_device *vdev = vsock->vdev;
+ /* Raw little-endian value, typed like virtio_vsock_config.guest_cid */
+ __le64 guest_cid;
+
+ vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
+ &guest_cid, sizeof(guest_cid));
+ /* vsock->guest_cid is u32, so only the low 32 bits are kept */
+ vsock->guest_cid = le64_to_cpu(guest_cid);
+}
+
+/* Act on one event from the device. Only TRANSPORT_RESET is handled:
+ * the guest CID is re-read and every connected socket is reset. Other
+ * event ids are ignored. event_lock must be held.
+ */
+static void virtio_vsock_event_handle(struct virtio_vsock *vsock,
+ struct virtio_vsock_event *event)
+{
+ if (le32_to_cpu(event->id) != VIRTIO_VSOCK_EVENT_TRANSPORT_RESET)
+ return;
+
+ virtio_vsock_update_guest_cid(vsock);
+ vsock_for_each_connected_socket(virtio_vsock_reset_sock);
+}
+
+/* Worker run after an event virtqueue callback: handles each completed
+ * event buffer under event_lock, re-posts it, and kicks the device.
+ */
+static void virtio_transport_event_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, event_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_EVENT];
+
+ mutex_lock(&vsock->event_lock);
+
+ do {
+ struct virtio_vsock_event *event;
+ unsigned int len;
+
+ virtqueue_disable_cb(vq);
+ while ((event = virtqueue_get_buf(vq, &len)) != NULL) {
+ /* Events of unexpected size are skipped, not handled */
+ if (len == sizeof(*event))
+ virtio_vsock_event_handle(vsock, event);
+
+ /* Give the buffer back to the device either way */
+ virtio_vsock_event_fill_one(vsock, event);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+ virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+
+ mutex_unlock(&vsock->event_lock);
+}
+
+/* Event virtqueue callback: defer processing to event_work. Does nothing
+ * when the device has no driver state attached (priv is NULL).
+ */
+static void virtio_vsock_event_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (vsock)
+ queue_work(virtio_vsock_workqueue, &vsock->event_work);
+}
+
+/* TX virtqueue callback: defer processing to tx_work. Does nothing when
+ * the device has no driver state attached (priv is NULL).
+ */
+static void virtio_vsock_tx_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (vsock)
+ queue_work(virtio_vsock_workqueue, &vsock->tx_work);
+}
+
+/* RX virtqueue callback: defer processing to rx_work. Does nothing when
+ * the device has no driver state attached (priv is NULL).
+ */
+static void virtio_vsock_rx_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (vsock)
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+}
+
+/* Transport operations table. The embedded .transport member holds the
+ * vsock callbacks, all but get_local_cid being the virtio_transport_*
+ * functions declared in the virtio_vsock header; .send_pkt is the hook
+ * that queues packets onto this driver's TX path.
+ */
+static struct virtio_transport virtio_transport = {
+ .transport = {
+ .get_local_cid = virtio_transport_get_local_cid,
+
+ .init = virtio_transport_do_socket_init,
+ .destruct = virtio_transport_destruct,
+ .release = virtio_transport_release,
+ .connect = virtio_transport_connect,
+ .shutdown = virtio_transport_shutdown,
+
+ .dgram_bind = virtio_transport_dgram_bind,
+ .dgram_dequeue = virtio_transport_dgram_dequeue,
+ .dgram_enqueue = virtio_transport_dgram_enqueue,
+ .dgram_allow = virtio_transport_dgram_allow,
+
+ .stream_dequeue = virtio_transport_stream_dequeue,
+ .stream_enqueue = virtio_transport_stream_enqueue,
+ .stream_has_data = virtio_transport_stream_has_data,
+ .stream_has_space = virtio_transport_stream_has_space,
+ .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
+ .stream_is_active = virtio_transport_stream_is_active,
+ .stream_allow = virtio_transport_stream_allow,
+
+ .notify_poll_in = virtio_transport_notify_poll_in,
+ .notify_poll_out = virtio_transport_notify_poll_out,
+ .notify_recv_init = virtio_transport_notify_recv_init,
+ .notify_recv_pre_block = virtio_transport_notify_recv_pre_block,
+ .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue,
+ .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
+ .notify_send_init = virtio_transport_notify_send_init,
+ .notify_send_pre_block = virtio_transport_notify_send_pre_block,
+ .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
+ .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
+
+ .set_buffer_size = virtio_transport_set_buffer_size,
+ .set_min_buffer_size = virtio_transport_set_min_buffer_size,
+ .set_max_buffer_size = virtio_transport_set_max_buffer_size,
+ .get_buffer_size = virtio_transport_get_buffer_size,
+ .get_min_buffer_size = virtio_transport_get_min_buffer_size,
+ .get_max_buffer_size = virtio_transport_get_max_buffer_size,
+ },
+
+ .send_pkt = virtio_transport_send_pkt,
+};
+
+/* Bind the driver to a virtio-vsock device.
+ *
+ * Allocates the per-device state, sets up the rx/tx/event virtqueues,
+ * registers the transport with the vsock core and pre-fills the RX and
+ * event queues. Only one device is supported; a second probe fails with
+ * -EBUSY.
+ *
+ * Returns 0 on success, otherwise a negative errno (including the one from
+ * an interrupted wait on the_virtio_vsock_mutex).
+ */
+static int virtio_vsock_probe(struct virtio_device *vdev)
+{
+	vq_callback_t *callbacks[] = {
+		virtio_vsock_rx_done,
+		virtio_vsock_tx_done,
+		virtio_vsock_event_done,
+	};
+	static const char * const names[] = {
+		"rx",
+		"tx",
+		"event",
+	};
+	struct virtio_vsock *vsock = NULL;
+	int ret;
+
+	ret = mutex_lock_interruptible(&the_virtio_vsock_mutex);
+	if (ret)
+		return ret;
+
+	/* Only one virtio-vsock device per guest is supported */
+	if (the_virtio_vsock) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	vsock = kzalloc(sizeof(*vsock), GFP_KERNEL);
+	if (!vsock) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	vsock->vdev = vdev;
+
+	/* Initialise every lock, list and work item before the structure
+	 * can be reached through vdev->priv or the_virtio_vsock. The
+	 * virtqueue callbacks queue these work items as soon as they see a
+	 * non-NULL vdev->priv.
+	 */
+	vsock->rx_buf_nr = 0;
+	vsock->rx_buf_max_nr = 0;
+	atomic_set(&vsock->queued_replies, 0);
+
+	mutex_init(&vsock->tx_lock);
+	mutex_init(&vsock->rx_lock);
+	mutex_init(&vsock->event_lock);
+	spin_lock_init(&vsock->send_pkt_list_lock);
+	INIT_LIST_HEAD(&vsock->send_pkt_list);
+	INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
+	INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
+	INIT_WORK(&vsock->event_work, virtio_transport_event_work);
+	INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
+
+	ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
+					    vsock->vqs, callbacks, names);
+	if (ret < 0)
+		goto out;
+
+	virtio_vsock_update_guest_cid(vsock);
+
+	ret = vsock_core_init(&virtio_transport.transport);
+	if (ret < 0)
+		goto out_vqs;
+
+	/* Publish the fully initialised device */
+	vdev->priv = vsock;
+	the_virtio_vsock = vsock;
+
+	mutex_lock(&vsock->rx_lock);
+	virtio_vsock_rx_fill(vsock);
+	mutex_unlock(&vsock->rx_lock);
+
+	mutex_lock(&vsock->event_lock);
+	virtio_vsock_event_fill(vsock);
+	mutex_unlock(&vsock->event_lock);
+
+	mutex_unlock(&the_virtio_vsock_mutex);
+	return 0;
+
+out_vqs:
+	vsock->vdev->config->del_vqs(vsock->vdev);
+out:
+	/* vsock is still NULL on the -EBUSY path; kfree(NULL) is a no-op */
+	kfree(vsock);
+	mutex_unlock(&the_virtio_vsock_mutex);
+	return ret;
+}
+/* Unbind from the device: flush the four work items, reset the device, free
+ * the packets still held by the RX/TX virtqueues and the send list, then
+ * unregister the transport, delete the virtqueues and free the state.
+ */
+static void virtio_vsock_remove(struct virtio_device *vdev)
+{
+ struct virtio_vsock *vsock = vdev->priv;
+ struct virtio_vsock_pkt *pkt;
+ /* NOTE(review): vdev->priv stays set, so nothing visible here stops a
+ * callback from queueing work again between these flushes and the
+ * reset below - confirm against the virtqueue callback contract.
+ */
+ flush_work(&vsock->rx_work);
+ flush_work(&vsock->tx_work);
+ flush_work(&vsock->event_work);
+ flush_work(&vsock->send_pkt_work);
+
+ vdev->config->reset(vdev);
+ /* Reclaim packets that were posted to the device but never used */
+ mutex_lock(&vsock->rx_lock);
+ while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX])))
+ virtio_transport_free_pkt(pkt);
+ mutex_unlock(&vsock->rx_lock);
+
+ mutex_lock(&vsock->tx_lock);
+ while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX])))
+ virtio_transport_free_pkt(pkt);
+ mutex_unlock(&vsock->tx_lock);
+ /* Drop packets that were queued for sending but not yet submitted */
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ while (!list_empty(&vsock->send_pkt_list)) {
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ /* Unpublish the device and unregister the transport under the same mutex probe takes */
+ mutex_lock(&the_virtio_vsock_mutex);
+ the_virtio_vsock = NULL;
+ vsock_core_exit();
+ mutex_unlock(&the_virtio_vsock_mutex);
+
+ vdev->config->del_vqs(vdev);
+
+ kfree(vsock);
+}
+
+/* Devices this driver binds to: VIRTIO_ID_VSOCK from any vendor. The
+ * all-zero entry terminates the table.
+ */
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+/* Feature bits offered by the driver: none */
+static unsigned int features[] = {
+};
+
+/* virtio driver description registered at module init */
+static struct virtio_driver virtio_vsock_driver = {
+	.driver.name        = KBUILD_MODNAME,
+	.driver.owner       = THIS_MODULE,
+	.id_table           = id_table,
+	.feature_table      = features,
+	.feature_table_size = ARRAY_SIZE(features),
+	.probe              = virtio_vsock_probe,
+	.remove             = virtio_vsock_remove,
+};
+
+/* Module init: create the driver workqueue, then register the virtio
+ * driver. The workqueue is destroyed again if registration fails.
+ *
+ * Returns 0 on success, -ENOMEM if the workqueue cannot be allocated, or
+ * the error from register_virtio_driver().
+ */
+static int __init virtio_vsock_init(void)
+{
+	int err;
+
+	virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0);
+	if (!virtio_vsock_workqueue)
+		return -ENOMEM;
+
+	err = register_virtio_driver(&virtio_vsock_driver);
+	if (!err)
+		return 0;
+
+	destroy_workqueue(virtio_vsock_workqueue);
+	return err;
+}
+
+/* Module exit: undo virtio_vsock_init() in reverse order - unregister the
+ * driver first, then destroy the workqueue its callbacks queue onto.
+ */
+static void __exit virtio_vsock_exit(void)
+{
+	unregister_virtio_driver(&virtio_vsock_driver);
+	destroy_workqueue(virtio_vsock_workqueue);
+}
+
+module_init(virtio_vsock_init);
+module_exit(virtio_vsock_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("virtio transport for vsock");
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
new file mode 100644
index 000000000000..9c07c76c504d
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -0,0 +1,999 @@
+/*
+ * common code for virtio vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+
+#include <net/sock.h>
+#include <net/af_vsock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/vsock_virtio_transport_common.h>
+
+/* How long to wait for graceful shutdown of a connection */
+#define VSOCK_CLOSE_TIMEOUT (8 * HZ)
+
+/* Return the virtio_transport that embeds the vsock_transport currently
+ * registered with the vsock core.
+ */
+static const struct virtio_transport *virtio_transport_get_ops(void)
+{
+	const struct vsock_transport *core_ops;
+
+	core_ops = vsock_core_get_transport();
+	return container_of(core_ops, struct virtio_transport, transport);
+}
+
+/* Allocate a packet and fill in its header from @info and the given
+ * addresses. When @info carries a msghdr and @len is non-zero, @len bytes
+ * of payload are copied out of it into a freshly allocated buffer.
+ *
+ * Returns the packet, or NULL if an allocation or the payload copy fails.
+ */
+struct virtio_vsock_pkt *
+virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
+			   size_t len,
+			   u32 src_cid,
+			   u32 src_port,
+			   u32 dst_cid,
+			   u32 dst_port)
+{
+	struct virtio_vsock_pkt *pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+
+	if (!pkt)
+		return NULL;
+
+	/* Header fields are stored little-endian */
+	pkt->hdr.type     = cpu_to_le16(info->type);
+	pkt->hdr.op       = cpu_to_le16(info->op);
+	pkt->hdr.src_cid  = cpu_to_le64(src_cid);
+	pkt->hdr.dst_cid  = cpu_to_le64(dst_cid);
+	pkt->hdr.src_port = cpu_to_le32(src_port);
+	pkt->hdr.dst_port = cpu_to_le32(dst_port);
+	pkt->hdr.flags    = cpu_to_le32(info->flags);
+	pkt->hdr.len      = cpu_to_le32(len);
+
+	/* Host-side bookkeeping */
+	pkt->len   = len;
+	pkt->reply = info->reply;
+	pkt->vsk   = info->vsk;
+
+	if (info->msg && len > 0) {
+		pkt->buf = kmalloc(len, GFP_KERNEL);
+		if (!pkt->buf)
+			goto free_pkt;
+
+		if (memcpy_from_msg(pkt->buf, info->msg, len))
+			goto free_pkt;
+	}
+
+	trace_virtio_transport_alloc_pkt(src_cid, src_port,
+					 dst_cid, dst_port,
+					 len,
+					 info->type,
+					 info->op,
+					 info->flags);
+
+	return pkt;
+
+free_pkt:
+	/* pkt->buf is NULL (from kzalloc) or the payload buffer; kfree()
+	 * accepts both.
+	 */
+	kfree(pkt->buf);
+	kfree(pkt);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt);
+
+/* Build a packet described by @info on behalf of @vsk and pass it to the
+ * transport's send_pkt hook.
+ *
+ * The payload length is capped at VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE and then
+ * at the credit granted by virtio_transport_get_credit(), so fewer than
+ * info->pkt_len bytes may be sent. Returns send_pkt's result, 0 when an
+ * OP_RW packet would carry no payload, or -ENOMEM if the packet cannot be
+ * allocated (the reserved credit is given back in that case).
+ */
+static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
+					  struct virtio_vsock_pkt_info *info)
+{
+	u32 local_cid, local_port, peer_cid, peer_port;
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct virtio_vsock_pkt *pkt;
+	u32 payload = info->pkt_len;
+
+	local_cid = vm_sockets_get_local_cid();
+	local_port = vsk->local_addr.svm_port;
+
+	/* An explicit destination in @info overrides the socket's peer */
+	if (info->remote_cid) {
+		peer_cid = info->remote_cid;
+		peer_port = info->remote_port;
+	} else {
+		peer_cid = vsk->remote_addr.svm_cid;
+		peer_port = vsk->remote_addr.svm_port;
+	}
+
+	/* we can send less than pkt_len bytes */
+	if (payload > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
+		payload = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+
+	/* virtio_transport_get_credit might return less than pkt_len credit */
+	payload = virtio_transport_get_credit(vvs, payload);
+
+	/* Do not send zero length OP_RW pkt */
+	if (info->op == VIRTIO_VSOCK_OP_RW && payload == 0)
+		return 0;
+
+	pkt = virtio_transport_alloc_pkt(info, payload,
+					 local_cid, local_port,
+					 peer_cid, peer_port);
+	if (!pkt) {
+		virtio_transport_put_credit(vvs, payload);
+		return -ENOMEM;
+	}
+
+	virtio_transport_inc_tx_pkt(vvs, pkt);
+
+	return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
+/* Account a newly queued packet's payload in rx_bytes. The caller in this
+ * file holds vvs->rx_lock.
+ */
+static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
+					struct virtio_vsock_pkt *pkt)
+{
+	vvs->rx_bytes += pkt->len;
+}
+
+/* Account a fully consumed packet: its payload leaves rx_bytes and is added
+ * to fwd_cnt. The caller in this file holds vvs->rx_lock.
+ */
+static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
+					struct virtio_vsock_pkt *pkt)
+{
+	vvs->fwd_cnt += pkt->len;
+	vvs->rx_bytes -= pkt->len;
+}
+
+/* Stamp an outgoing packet with this socket's current fwd_cnt and buf_alloc
+ * so the peer can update its view of our receive buffer.
+ *
+ * fwd_cnt is advanced by virtio_transport_dec_rx_pkt(), which runs under
+ * rx_lock, so the snapshot must be taken under rx_lock as well; tx_lock
+ * does not serialise against that writer.
+ */
+void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
+{
+	spin_lock_bh(&vvs->rx_lock);
+	pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
+	pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
+	spin_unlock_bh(&vvs->rx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
+
+/* Reserve up to @credit bytes of the peer's receive buffer for sending.
+ *
+ * Available space is peer_buf_alloc minus the bytes sent but not yet
+ * forwarded by the peer (tx_cnt - peer_fwd_cnt). The subtraction is done in
+ * s64 and clamped at zero: if the peer advertises a smaller buf_alloc than
+ * what is already in flight, an unsigned result would wrap to a huge value
+ * and grant credit that does not exist.
+ *
+ * Returns the number of bytes reserved (possibly 0), already added to
+ * tx_cnt. Undo with virtio_transport_put_credit() if nothing is sent.
+ */
+u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+	s64 avail;
+	u32 ret;
+
+	spin_lock_bh(&vvs->tx_lock);
+	/* NOTE(review): assumes the counters are 32-bit, matching the
+	 * le32 header fields they are loaded from - confirm in the header.
+	 */
+	avail = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+	if (avail < 0)
+		avail = 0;
+	ret = avail < credit ? (u32)avail : credit;
+	vvs->tx_cnt += ret;
+	spin_unlock_bh(&vvs->tx_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_credit);
+
+/* Give back @credit bytes previously reserved with
+ * virtio_transport_get_credit() by subtracting them from tx_cnt.
+ */
+void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+	spin_lock_bh(&vvs->tx_lock);
+	vvs->tx_cnt -= credit;
+	spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_put_credit);
+
+/* Send a payload-less OP_CREDIT_UPDATE packet of the given @type to the
+ * peer of @vsk. @hdr is accepted but not used.
+ */
+static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
+					       int type,
+					       struct virtio_vsock_hdr *hdr)
+{
+	struct virtio_vsock_pkt_info update = {
+		.vsk = vsk,
+		.type = type,
+		.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &update);
+}
+/* Copy up to @len bytes of queued stream payload from the socket's rx_queue
+ * into @msg. A fully consumed packet is unlinked and freed; a partly
+ * consumed one stays at the head with its offset advanced. A credit update
+ * is sent to the peer after a successful pass.
+ *
+ * Returns the number of bytes copied. If memcpy_to_msg() fails, returns the
+ * bytes copied so far when that is non-zero, otherwise the copy error; no
+ * credit update is sent on that path.
+ */
+static ssize_t
+virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ struct virtio_vsock_pkt *pkt;
+ size_t bytes, total = 0;
+ int err = -EFAULT;
+
+ spin_lock_bh(&vvs->rx_lock);
+ while (total < len && !list_empty(&vvs->rx_queue)) {
+ pkt = list_first_entry(&vvs->rx_queue,
+ struct virtio_vsock_pkt, list);
+ /* Smaller of what the caller still wants and what this pkt still holds */
+ bytes = len - total;
+ if (bytes > pkt->len - pkt->off)
+ bytes = pkt->len - pkt->off;
+
+ /* sk_lock is held by caller so no one else can dequeue.
+ * Unlock rx_lock since memcpy_to_msg() may sleep.
+ */
+ spin_unlock_bh(&vvs->rx_lock);
+
+ err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
+ if (err)
+ goto out; /* rx_lock is not held here */
+
+ spin_lock_bh(&vvs->rx_lock);
+
+ total += bytes;
+ pkt->off += bytes;
+ if (pkt->off == pkt->len) {
+ virtio_transport_dec_rx_pkt(vvs, pkt);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ }
+ spin_unlock_bh(&vvs->rx_lock);
+
+ /* Send a credit pkt to peer */
+ virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
+ NULL);
+
+ return total;
+
+out:
+ if (total)
+ err = total;
+ return err;
+}
+
+/* stream_dequeue hook: read up to @len bytes into @msg. MSG_PEEK is not
+ * supported and yields -EOPNOTSUPP.
+ */
+ssize_t
+virtio_transport_stream_dequeue(struct vsock_sock *vsk,
+				struct msghdr *msg,
+				size_t len, int flags)
+{
+	if (!(flags & MSG_PEEK))
+		return virtio_transport_stream_do_dequeue(vsk, msg, len);
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue);
+
+/* dgram_dequeue hook: datagram receive is not implemented; always returns
+ * -EOPNOTSUPP.
+ */
+int
+virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
+			       struct msghdr *msg,
+			       size_t len, int flags)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue);
+
+/* stream_has_data hook: return rx_bytes, the payload currently queued on
+ * the socket, sampled under rx_lock.
+ */
+s64 virtio_transport_stream_has_data(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	s64 queued;
+
+	spin_lock_bh(&vvs->rx_lock);
+	queued = vvs->rx_bytes;
+	spin_unlock_bh(&vvs->rx_lock);
+
+	return queued;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data);
+
+/* Return how many bytes the peer can still accept: peer_buf_alloc minus the
+ * bytes sent but not yet forwarded (tx_cnt - peer_fwd_cnt), never below 0.
+ * Callers in this file hold vvs->tx_lock.
+ *
+ * peer_buf_alloc is widened to s64 before subtracting. Without the cast the
+ * whole expression is evaluated in unsigned arithmetic, so a peer that
+ * shrinks buf_alloc below the in-flight amount would produce a large
+ * positive value and the "< 0" clamp could never trigger.
+ */
+static s64 virtio_transport_has_space(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	s64 bytes;
+
+	/* NOTE(review): assumes the counters are 32-bit, matching the
+	 * le32 header fields they are loaded from - confirm in the header.
+	 */
+	bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+	if (bytes < 0)
+		bytes = 0;
+
+	return bytes;
+}
+
+/* stream_has_space hook: locked wrapper around
+ * virtio_transport_has_space().
+ */
+s64 virtio_transport_stream_has_space(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	s64 space;
+
+	spin_lock_bh(&vvs->tx_lock);
+	space = virtio_transport_has_space(vsk);
+	spin_unlock_bh(&vvs->tx_lock);
+
+	return space;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space);
+
+/* init hook: allocate and attach the per-socket transport state.
+ *
+ * With a parent socket @psk, the buffer sizes and peer_buf_alloc are copied
+ * from the parent; otherwise the VIRTIO_VSOCK_DEFAULT_* sizes are used.
+ * Returns 0, or -ENOMEM if the allocation fails.
+ */
+int virtio_transport_do_socket_init(struct vsock_sock *vsk,
+				    struct vsock_sock *psk)
+{
+	struct virtio_vsock_sock *vvs = kzalloc(sizeof(*vvs), GFP_KERNEL);
+
+	if (!vvs)
+		return -ENOMEM;
+
+	vvs->vsk = vsk;
+	vsk->trans = vvs;
+
+	if (!psk) {
+		vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE;
+		vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE;
+		vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE;
+	} else {
+		struct virtio_vsock_sock *parent = psk->trans;
+
+		vvs->buf_size = parent->buf_size;
+		vvs->buf_size_min = parent->buf_size_min;
+		vvs->buf_size_max = parent->buf_size_max;
+		vvs->peer_buf_alloc = parent->peer_buf_alloc;
+	}
+
+	/* Advertised receive allocation starts equal to the buffer size */
+	vvs->buf_alloc = vvs->buf_size;
+
+	INIT_LIST_HEAD(&vvs->rx_queue);
+	spin_lock_init(&vvs->rx_lock);
+	spin_lock_init(&vvs->tx_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init);
+
+/* get_buffer_size hook: return the socket's current buf_size */
+u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk)
+{
+	const struct virtio_vsock_sock *vvs = vsk->trans;
+
+	return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size);
+
+/* get_min_buffer_size hook: return the socket's buf_size_min */
+u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk)
+{
+	const struct virtio_vsock_sock *vvs = vsk->trans;
+
+	return vvs->buf_size_min;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size);
+
+/* get_max_buffer_size hook: return the socket's buf_size_max */
+u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk)
+{
+	const struct virtio_vsock_sock *vvs = vsk->trans;
+
+	return vvs->buf_size_max;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size);
+
+/* set_buffer_size hook: set buf_size and buf_alloc to @val, capped at
+ * VIRTIO_VSOCK_MAX_BUF_SIZE. The min/max bounds are widened when the new
+ * size falls outside them.
+ */
+void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	u64 size = val;
+
+	if (size > VIRTIO_VSOCK_MAX_BUF_SIZE)
+		size = VIRTIO_VSOCK_MAX_BUF_SIZE;
+
+	if (size < vvs->buf_size_min)
+		vvs->buf_size_min = size;
+	if (size > vvs->buf_size_max)
+		vvs->buf_size_max = size;
+
+	vvs->buf_size = size;
+	vvs->buf_alloc = size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
+
+/* set_min_buffer_size hook: set buf_size_min to @val, capped at
+ * VIRTIO_VSOCK_MAX_BUF_SIZE, raising buf_size if it is below the new
+ * minimum.
+ */
+void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	u64 floor = val;
+
+	if (floor > VIRTIO_VSOCK_MAX_BUF_SIZE)
+		floor = VIRTIO_VSOCK_MAX_BUF_SIZE;
+
+	if (floor > vvs->buf_size)
+		vvs->buf_size = floor;
+
+	vvs->buf_size_min = floor;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size);
+
+/* set_max_buffer_size hook: set buf_size_max to @val, capped at
+ * VIRTIO_VSOCK_MAX_BUF_SIZE, lowering buf_size if it is above the new
+ * maximum.
+ */
+void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	u64 ceiling = val;
+
+	if (ceiling > VIRTIO_VSOCK_MAX_BUF_SIZE)
+		ceiling = VIRTIO_VSOCK_MAX_BUF_SIZE;
+
+	if (ceiling < vvs->buf_size)
+		vvs->buf_size = ceiling;
+
+	vvs->buf_size_max = ceiling;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size);
+
+/* notify_poll_in hook: report through @data_ready_now whether
+ * vsock_stream_has_data() is non-zero. @target is ignored. Always
+ * returns 0.
+ */
+int
+virtio_transport_notify_poll_in(struct vsock_sock *vsk,
+				size_t target,
+				bool *data_ready_now)
+{
+	*data_ready_now = vsock_stream_has_data(vsk) != 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in);
+
+/* notify_poll_out hook: report through @space_avail_now whether the peer
+ * can accept more data. @target is ignored. Always returns 0.
+ *
+ * The out-parameter is written on every path; a negative result from
+ * vsock_stream_has_space() is reported as "no space" instead of leaving
+ * the caller's variable untouched.
+ */
+int
+virtio_transport_notify_poll_out(struct vsock_sock *vsk,
+				 size_t target,
+				 bool *space_avail_now)
+{
+	s64 free_space = vsock_stream_has_space(vsk);
+
+	*space_avail_now = free_space > 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out);
+
+/* notify_recv_init hook: nothing to do for this transport; returns 0 */
+int virtio_transport_notify_recv_init(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init);
+
+/* notify_recv_pre_block hook: nothing to do for this transport; returns 0 */
+int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block);
+
+/* notify_recv_pre_dequeue hook: nothing to do for this transport;
+ * returns 0
+ */
+int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk,
+	size_t target, struct vsock_transport_recv_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue);
+
+/* notify_recv_post_dequeue hook: nothing to do for this transport;
+ * returns 0
+ */
+int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk,
+	size_t target, ssize_t copied, bool data_read,
+	struct vsock_transport_recv_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue);
+
+/* notify_send_init hook: nothing to do for this transport; returns 0 */
+int virtio_transport_notify_send_init(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init);
+
+/* notify_send_pre_block hook: nothing to do for this transport; returns 0 */
+int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block);
+
+/* notify_send_pre_enqueue hook: nothing to do for this transport;
+ * returns 0
+ */
+int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk,
+	struct vsock_transport_send_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue);
+
+/* notify_send_post_enqueue hook: nothing to do for this transport;
+ * returns 0
+ */
+int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk,
+	ssize_t written, struct vsock_transport_send_notify_data *data)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue);
+
+/* stream_rcvhiwat hook: the receive high-water mark is the socket's
+ * buf_size.
+ */
+u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk)
+{
+	const struct virtio_vsock_sock *vvs = vsk->trans;
+
+	return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat);
+
+/* stream_is_active hook: unconditionally true for this transport */
+bool virtio_transport_stream_is_active(struct vsock_sock *vsk)
+{
+	return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active);
+
+/* stream_allow hook: stream sockets are accepted for every @cid/@port */
+bool virtio_transport_stream_allow(u32 cid, u32 port)
+{
+	return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
+
+/* dgram_bind hook: datagram sockets are not implemented; always returns
+ * -EOPNOTSUPP.
+ */
+int virtio_transport_dgram_bind(struct vsock_sock *vsk,
+				struct sockaddr_vm *addr)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind);
+
+/* dgram_allow hook: datagrams are refused for every @cid/@port */
+bool virtio_transport_dgram_allow(u32 cid, u32 port)
+{
+	return false;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow);
+
+/* connect hook: send a stream OP_REQUEST packet to the socket's peer.
+ * Returns the result of virtio_transport_send_pkt_info().
+ */
+int virtio_transport_connect(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_pkt_info request = {
+		.vsk = vsk,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.op = VIRTIO_VSOCK_OP_REQUEST,
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &request);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_connect);
+
+/* shutdown hook: send a stream OP_SHUTDOWN packet whose flags mirror the
+ * RCV_SHUTDOWN / SEND_SHUTDOWN bits set in @mode. Returns the result of
+ * virtio_transport_send_pkt_info().
+ */
+int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_SHUTDOWN,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.vsk = vsk,
+	};
+
+	if (mode & RCV_SHUTDOWN)
+		info.flags |= VIRTIO_VSOCK_SHUTDOWN_RCV;
+	if (mode & SEND_SHUTDOWN)
+		info.flags |= VIRTIO_VSOCK_SHUTDOWN_SEND;
+
+	return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_shutdown);
+
+/* dgram_enqueue hook: datagram send is not implemented; always returns
+ * -EOPNOTSUPP.
+ */
+int
+virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
+			       struct sockaddr_vm *remote_addr,
+			       struct msghdr *msg,
+			       size_t dgram_len)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue);
+
+/* stream_enqueue hook: send up to @len bytes from @msg as one OP_RW
+ * packet. Returns whatever virtio_transport_send_pkt_info() returns.
+ */
+ssize_t
+virtio_transport_stream_enqueue(struct vsock_sock *vsk,
+				struct msghdr *msg,
+				size_t len)
+{
+	struct virtio_vsock_pkt_info data = {
+		.vsk = vsk,
+		.msg = msg,
+		.pkt_len = len,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.op = VIRTIO_VSOCK_OP_RW,
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &data);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue);
+
+/* destruct hook: release the per-socket transport state.
+ *
+ * Packets received but never read are still linked on vvs->rx_queue at this
+ * point; they are freed here, otherwise they would leak together with
+ * their payload buffers once vvs is gone. vsk->trans may be NULL (kfree()
+ * tolerated that before), so that case is handled explicitly.
+ */
+void virtio_transport_destruct(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct virtio_vsock_pkt *pkt, *tmp;
+
+	if (!vvs)
+		return;
+
+	spin_lock_bh(&vvs->rx_lock);
+	list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) {
+		list_del(&pkt->list);
+		virtio_transport_free_pkt(pkt);
+	}
+	spin_unlock_bh(&vvs->rx_lock);
+
+	kfree(vvs);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_destruct);
+
+/* Send a stream OP_RST for @vsk. When @pkt is given, the RST is marked as
+ * a reply to it, and nothing is sent (return 0) if @pkt is itself an RST.
+ */
+static int virtio_transport_reset(struct vsock_sock *vsk,
+				  struct virtio_vsock_pkt *pkt)
+{
+	struct virtio_vsock_pkt_info rst = {
+		.vsk = vsk,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.op = VIRTIO_VSOCK_OP_RST,
+		.reply = pkt != NULL,
+	};
+
+	/* Send RST only if the original pkt is not a RST pkt */
+	if (!pkt || le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_RST)
+		return virtio_transport_send_pkt_info(vsk, &rst);
+
+	return 0;
+}
+
+/* Answer @pkt with an RST when no socket is available to act for it, e.g.
+ * after a connection attempt to a socket that does not exist. The reply
+ * swaps the source and destination addresses of @pkt and keeps its type.
+ *
+ * Returns 0 if @pkt is itself an RST (nothing is sent), -ENOMEM if the
+ * reply cannot be allocated, otherwise the result of send_pkt.
+ */
+static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
+{
+	struct virtio_vsock_pkt_info info = {
+		.op = VIRTIO_VSOCK_OP_RST,
+		.type = le16_to_cpu(pkt->hdr.type),
+		.reply = true,
+	};
+	struct virtio_vsock_pkt *rst;
+
+	/* Send RST only if the original pkt is not a RST pkt */
+	if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+		return 0;
+
+	rst = virtio_transport_alloc_pkt(&info, 0,
+					 le64_to_cpu(pkt->hdr.dst_cid),
+					 le32_to_cpu(pkt->hdr.dst_port),
+					 le64_to_cpu(pkt->hdr.src_cid),
+					 le32_to_cpu(pkt->hdr.src_port));
+	if (!rst)
+		return -ENOMEM;
+
+	return virtio_transport_get_ops()->send_pkt(rst);
+}
+
+/* Sleep interruptibly until the socket is flagged SOCK_DONE, a signal is
+ * pending or @timeout runs out. A zero @timeout returns immediately.
+ */
+static void virtio_transport_wait_close(struct sock *sk, long timeout)
+{
+	DEFINE_WAIT(wait);
+
+	if (!timeout)
+		return;
+
+	do {
+		prepare_to_wait(sk_sleep(sk), &wait,
+				TASK_INTERRUPTIBLE);
+		if (sk_wait_event(sk, &timeout,
+				  sock_flag(sk, SOCK_DONE)))
+			break;
+	} while (!signal_pending(current) && timeout);
+
+	finish_wait(sk_sleep(sk), &wait);
+}
+/* Finish closing @vsk: flag the socket SOCK_DONE, record a full peer
+ * shutdown, switch to SS_DISCONNECTING when no received data is pending
+ * and run the state-change callback. If a close timeout was scheduled,
+ * also call vsock_remove_sock() and drop the reference taken when it was
+ * scheduled; with @cancel_timeout set this happens only if
+ * cancel_delayed_work() succeeds.
+ */
+static void virtio_transport_do_close(struct vsock_sock *vsk,
+ bool cancel_timeout)
+{
+ struct sock *sk = sk_vsock(vsk);
+
+ sock_set_flag(sk, SOCK_DONE);
+ vsk->peer_shutdown = SHUTDOWN_MASK;
+ if (vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+ sk->sk_state_change(sk);
+ /* cancel_delayed_work() is only attempted when cancel_timeout is true */
+ if (vsk->close_work_scheduled &&
+ (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) {
+ vsk->close_work_scheduled = false;
+
+ vsock_remove_sock(vsk);
+
+ /* Release refcnt obtained when we scheduled the timeout */
+ sock_put(sk);
+ }
+}
+/* Delayed work run VSOCK_CLOSE_TIMEOUT after virtio_transport_close()
+ * scheduled it: if the socket has not been flagged SOCK_DONE by then, send
+ * an RST and complete the close locally.
+ */
+static void virtio_transport_close_timeout(struct work_struct *work)
+{
+ struct vsock_sock *vsk =
+ container_of(work, struct vsock_sock, close_work.work);
+ struct sock *sk = sk_vsock(vsk);
+ /* Temporary reference for the duration of this function; dropped at the end */
+ sock_hold(sk);
+ lock_sock(sk);
+
+ if (!sock_flag(sk, SOCK_DONE)) {
+ (void)virtio_transport_reset(vsk, NULL);
+ /* false: no attempt to cancel the work item that is running right now */
+ virtio_transport_do_close(vsk, false);
+ }
+
+ vsk->close_work_scheduled = false;
+
+ release_sock(sk);
+ sock_put(sk);
+}
+
+/* User context, vsk->sk is locked.
+ *
+ * Start closing a stream socket. Returns true when the caller may remove
+ * the socket right away, false when removal is left to the close timeout
+ * scheduled here (or to virtio_transport_do_close()).
+ */
+static bool virtio_transport_close(struct vsock_sock *vsk)
+{
+ struct sock *sk = &vsk->sk;
+ /* Nothing to negotiate unless the socket is connected or disconnecting */
+ if (!(sk->sk_state == SS_CONNECTED ||
+ sk->sk_state == SS_DISCONNECTING))
+ return true;
+
+ /* Already received SHUTDOWN from peer, reply with RST */
+ if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) {
+ (void)virtio_transport_reset(vsk, NULL);
+ return true;
+ }
+ /* Send a full SHUTDOWN unless both directions are already shut down locally */
+ if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
+ (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
+ /* With SO_LINGER (and the task not exiting) wait up to sk_lingertime */
+ if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
+ virtio_transport_wait_close(sk, sk->sk_lingertime);
+
+ if (sock_flag(sk, SOCK_DONE)) {
+ return true;
+ }
+ /* Not done yet: hold a reference for the timeout work and schedule it */
+ sock_hold(sk);
+ INIT_DELAYED_WORK(&vsk->close_work,
+ virtio_transport_close_timeout);
+ vsk->close_work_scheduled = true;
+ schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT);
+ return false;
+}
+
+/* release hook: for stream sockets run the close handshake under the
+ * socket lock, then remove the socket unless the close path deferred the
+ * removal. Other socket types are removed immediately.
+ */
+void virtio_transport_release(struct vsock_sock *vsk)
+{
+	struct sock *sk = &vsk->sk;
+	bool remove_now;
+
+	lock_sock(sk);
+	remove_now = sk->sk_type != SOCK_STREAM ||
+		     virtio_transport_close(vsk);
+	release_sock(sk);
+
+	if (remove_now)
+		vsock_remove_sock(vsk);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_release);
+/* Handle a packet for a socket in SS_CONNECTING. OP_RESPONSE completes the
+ * connection; OP_INVALID is ignored; any other op sends an RST (skipped by
+ * virtio_transport_reset() when @pkt is itself an RST), puts the socket
+ * back to SS_UNCONNECTED and reports an error on it.
+ * Returns 0, or -EINVAL for an unexpected op. @pkt is not freed here.
+ */
+static int
+virtio_transport_recv_connecting(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ int err;
+ int skerr;
+
+ switch (le16_to_cpu(pkt->hdr.op)) {
+ case VIRTIO_VSOCK_OP_RESPONSE:
+ sk->sk_state = SS_CONNECTED;
+ sk->sk_socket->state = SS_CONNECTED;
+ vsock_insert_connected(vsk);
+ sk->sk_state_change(sk);
+ break;
+ case VIRTIO_VSOCK_OP_INVALID:
+ break;
+ case VIRTIO_VSOCK_OP_RST:
+ skerr = ECONNRESET;
+ err = 0;
+ goto destroy;
+ default:
+ skerr = EPROTO;
+ err = -EINVAL;
+ goto destroy;
+ }
+ return 0;
+ /* Failure path: skerr goes to the socket, err back to the caller */
+destroy:
+ virtio_transport_reset(vsk, pkt);
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = skerr;
+ sk->sk_error_report(sk);
+ return err;
+}
+/* Handle a packet for a socket in SS_CONNECTED. Takes ownership of @pkt:
+ * an OP_RW packet is queued on the socket's rx_queue, every other packet
+ * is freed before returning. Returns 0, or -EINVAL for an unknown op.
+ */
+static int
+virtio_transport_recv_connected(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ int err = 0;
+
+ switch (le16_to_cpu(pkt->hdr.op)) {
+ case VIRTIO_VSOCK_OP_RW:
+ pkt->len = le32_to_cpu(pkt->hdr.len);
+ pkt->off = 0;
+
+ spin_lock_bh(&vvs->rx_lock);
+ virtio_transport_inc_rx_pkt(vvs, pkt);
+ list_add_tail(&pkt->list, &vvs->rx_queue);
+ spin_unlock_bh(&vvs->rx_lock);
+
+ sk->sk_data_ready(sk);
+ return err; /* pkt now belongs to rx_queue: skip the free below */
+ case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
+ sk->sk_write_space(sk);
+ break;
+ case VIRTIO_VSOCK_OP_SHUTDOWN:
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
+ vsk->peer_shutdown |= RCV_SHUTDOWN;
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
+ vsk->peer_shutdown |= SEND_SHUTDOWN;
+ if (vsk->peer_shutdown == SHUTDOWN_MASK &&
+ vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+ if (le32_to_cpu(pkt->hdr.flags))
+ sk->sk_state_change(sk);
+ break;
+ case VIRTIO_VSOCK_OP_RST:
+ virtio_transport_do_close(vsk, true);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ virtio_transport_free_pkt(pkt);
+ return err;
+}
+
+/* Handle a packet for a socket in SS_DISCONNECTING: only an RST has any
+ * effect, completing the close and cancelling the close timeout. @pkt is
+ * not freed here.
+ */
+static void
+virtio_transport_recv_disconnecting(struct sock *sk,
+				    struct virtio_vsock_pkt *pkt)
+{
+	if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_RST)
+		return;
+
+	virtio_transport_do_close(vsock_sk(sk), true);
+}
+
+/* Reply to the connection request @pkt with a stream OP_RESPONSE addressed
+ * to the request's source cid/port, sent on behalf of @vsk.
+ */
+static int
+virtio_transport_send_response(struct vsock_sock *vsk,
+			       struct virtio_vsock_pkt *pkt)
+{
+	struct virtio_vsock_pkt_info response = {
+		.vsk = vsk,
+		.reply = true,
+		.type = VIRTIO_VSOCK_TYPE_STREAM,
+		.op = VIRTIO_VSOCK_OP_RESPONSE,
+		.remote_cid = le64_to_cpu(pkt->hdr.src_cid),
+		.remote_port = le32_to_cpu(pkt->hdr.src_port),
+	};
+
+	return virtio_transport_send_pkt_info(vsk, &response);
+}
+
+/* Handle server socket: a packet arriving on a listening socket.
+ *
+ * Only OP_REQUEST is accepted; it creates a connected child socket, queues
+ * it for accept() and answers with OP_RESPONSE. Anything else, a full
+ * accept queue or a failed child allocation is answered with an RST.
+ * Returns 0, -EINVAL or -ENOMEM. @pkt is not freed here.
+ */
+static int
+virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct vsock_sock *vchild;
+ struct sock *child;
+
+ if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
+ virtio_transport_reset(vsk, pkt);
+ return -EINVAL;
+ }
+
+ if (sk_acceptq_is_full(sk)) {
+ virtio_transport_reset(vsk, pkt);
+ return -ENOMEM;
+ }
+
+ child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
+ sk->sk_type, 0);
+ if (!child) {
+ virtio_transport_reset(vsk, pkt);
+ return -ENOMEM;
+ }
+
+ sk->sk_ack_backlog++;
+
+ lock_sock_nested(child, SINGLE_DEPTH_NESTING);
+
+ child->sk_state = SS_CONNECTED;
+ /* The child's local address is the request's destination, its peer the request's source */
+ vchild = vsock_sk(child);
+ vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port));
+ vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+
+ vsock_insert_connected(vchild);
+ vsock_enqueue_accept(sk, child);
+ virtio_transport_send_response(vchild, pkt); /* return value is not checked */
+
+ release_sock(child);
+
+ sk->sk_data_ready(sk);
+ return 0;
+}
+
+/* Refresh the peer's buf_alloc and fwd_cnt from the header of @pkt and
+ * report whether the peer now has room for more data.
+ */
+static bool virtio_transport_space_update(struct sock *sk,
+					  struct virtio_vsock_pkt *pkt)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	bool has_room;
+
+	/* buf_alloc and fwd_cnt is always included in the hdr */
+	spin_lock_bh(&vvs->tx_lock);
+	vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+	vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
+	has_room = virtio_transport_has_space(vsk);
+	spin_unlock_bh(&vvs->tx_lock);
+
+	return has_room;
+}
+
+/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
+ * lock.
+ *
+ * Entry point for a received packet. Takes ownership of @pkt: it is either
+ * queued on a socket (OP_RW on a connected socket) or freed before return.
+ * Non-stream packets and packets with no matching socket are answered with
+ * an RST; otherwise the packet is dispatched on the socket's state.
+ */
+void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct sockaddr_vm src, dst;
+ struct vsock_sock *vsk;
+ struct sock *sk;
+ bool space_available;
+
+ vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+ vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port));
+
+ trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
+ dst.svm_cid, dst.svm_port,
+ le32_to_cpu(pkt->hdr.len),
+ le16_to_cpu(pkt->hdr.type),
+ le16_to_cpu(pkt->hdr.op),
+ le32_to_cpu(pkt->hdr.flags),
+ le32_to_cpu(pkt->hdr.buf_alloc),
+ le32_to_cpu(pkt->hdr.fwd_cnt));
+ /* Only stream packets are handled */
+ if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
+ (void)virtio_transport_reset_no_sock(pkt);
+ goto free_pkt;
+ }
+
+ /* The socket must be in connected or bound table
+ * otherwise send reset back
+ */
+ sk = vsock_find_connected_socket(&src, &dst);
+ if (!sk) {
+ sk = vsock_find_bound_socket(&dst);
+ if (!sk) {
+ (void)virtio_transport_reset_no_sock(pkt);
+ goto free_pkt;
+ }
+ }
+
+ vsk = vsock_sk(sk);
+ /* Peer credit is refreshed before the socket lock is taken */
+ space_available = virtio_transport_space_update(sk, pkt);
+
+ lock_sock(sk);
+
+ /* Update CID in case it has changed after a transport reset event */
+ vsk->local_addr.svm_cid = dst.svm_cid;
+
+ if (space_available)
+ sk->sk_write_space(sk);
+ /* Every case except SS_CONNECTED frees pkt itself; recv_connected() owns it */
+ switch (sk->sk_state) {
+ case VSOCK_SS_LISTEN:
+ virtio_transport_recv_listen(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ case SS_CONNECTING:
+ virtio_transport_recv_connecting(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ case SS_CONNECTED:
+ virtio_transport_recv_connected(sk, pkt);
+ break;
+ case SS_DISCONNECTING:
+ virtio_transport_recv_disconnecting(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ default:
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+ release_sock(sk);
+
+ /* Release refcnt obtained when we fetched this socket out of the
+ * bound or connected list.
+ */
+ sock_put(sk);
+ return;
+
+free_pkt:
+ virtio_transport_free_pkt(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
+
+/* Free a packet together with its payload buffer. pkt->buf may be NULL for
+ * packets without payload; @pkt itself must be valid.
+ */
+void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
+{
+	kfree(pkt->buf);
+	kfree(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index d24773552b64..008f3424dcbc 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1676,6 +1676,8 @@ static void vmci_transport_destruct(struct vsock_sock *vsk)
static void vmci_transport_release(struct vsock_sock *vsk)
{
+ vsock_remove_sock(vsk);
+
if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) {
vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle);
vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
@@ -1767,11 +1769,8 @@ static int vmci_transport_dgram_dequeue(struct vsock_sock *vsk,
/* Retrieve the head sk_buff from the socket's receive queue. */
err = 0;
skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err);
- if (err)
- return err;
-
if (!skb)
- return -EAGAIN;
+ return err;
dg = (struct vmci_datagram *)skb->data;
if (!dg)
@@ -2086,7 +2085,7 @@ static u32 vmci_transport_get_local_cid(void)
return vmci_get_context_id();
}
-static struct vsock_transport vmci_transport = {
+static const struct vsock_transport vmci_transport = {
.init = vmci_transport_socket_init,
.destruct = vmci_transport_destruct,
.release = vmci_transport_release,
@@ -2186,7 +2185,7 @@ module_exit(vmci_transport_exit);
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMCI transport for Virtual Sockets");
-MODULE_VERSION("1.0.3.0-k");
+MODULE_VERSION("1.0.4.0-k");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("vmware_vsock");
MODULE_ALIAS_NETPROTO(PF_VSOCK);
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 0467e5ba82e0..5d8ac2d798df 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -4792,6 +4792,13 @@ static void alc280_fixup_hp_9480m(struct hda_codec *codec,
}
}
+/* Fixup callback: at the pre-probe stage set pin 0x19's target to
+ * PIN_VREFHIZ. Every other fixup action is ignored.
+ */
+static void alc_fixup_disable_mic_vref(struct hda_codec *codec,
+				       const struct hda_fixup *fix, int action)
+{
+	if (action != HDA_FIXUP_ACT_PRE_PROBE)
+		return;
+
+	snd_hda_codec_set_pin_target(codec, 0x19, PIN_VREFHIZ);
+}
+
/* for hda_fixup_thinkpad_acpi() */
#include "thinkpad_helper.c"
@@ -4891,6 +4898,7 @@ enum {
ALC293_FIXUP_LENOVO_SPK_NOISE,
ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY,
ALC255_FIXUP_DELL_SPK_NOISE,
+ ALC225_FIXUP_DISABLE_MIC_VREF,
ALC225_FIXUP_DELL1_MIC_NO_PRESENCE,
ALC295_FIXUP_DISABLE_DAC3,
ALC280_FIXUP_HP_HEADSET_MIC,
@@ -5546,6 +5554,12 @@ static const struct hda_fixup alc269_fixups[] = {
.chained = true,
.chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE
},
+ [ALC225_FIXUP_DISABLE_MIC_VREF] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = alc_fixup_disable_mic_vref,
+ .chained = true,
+ .chain_id = ALC269_FIXUP_DELL1_MIC_NO_PRESENCE
+ },
[ALC225_FIXUP_DELL1_MIC_NO_PRESENCE] = {
.type = HDA_FIXUP_VERBS,
.v.verbs = (const struct hda_verb[]) {
@@ -5555,7 +5569,7 @@ static const struct hda_fixup alc269_fixups[] = {
{}
},
.chained = true,
- .chain_id = ALC269_FIXUP_DELL1_MIC_NO_PRESENCE
+ .chain_id = ALC225_FIXUP_DISABLE_MIC_VREF
},
[ALC280_FIXUP_HP_HEADSET_MIC] = {
.type = HDA_FIXUP_FUNC,
diff --git a/tools/virtio/linux/dma-mapping.h b/tools/virtio/linux/dma-mapping.h
new file mode 100644
index 000000000000..4f93af89ae16
--- /dev/null
+++ b/tools/virtio/linux/dma-mapping.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_DMA_MAPPING_H
+#define _LINUX_DMA_MAPPING_H
+/* Minimal linux/dma-mapping.h for the tools/virtio build: it provides only the definitions below and refuses to build when CONFIG_HAS_DMA is defined. */
+#ifdef CONFIG_HAS_DMA
+# error Virtio userspace code does not support CONFIG_HAS_DMA
+#endif
+/* NOTE(review): presumably needed by the virtio ring code compiled in userspace - confirm who reads it. */
+#define PCI_DMA_BUS_IS_PHYS 1
+/* DMA transfer direction constants with explicit numeric values. */
+enum dma_data_direction {
+ DMA_BIDIRECTIONAL = 0,
+ DMA_TO_DEVICE = 1,
+ DMA_FROM_DEVICE = 2,
+ DMA_NONE = 3,
+};
+
+#endif /* _LINUX_DMA_MAPPING_H */