Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f0c98ebc authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull libnvdimm updates from Dan Williams:

 - Replace pcommit with ADR / directed-flushing.

   The pcommit instruction, which has not shipped on any product, is
   deprecated.  Instead, the requirement is that platforms implement
   either ADR, or provide one or more flush addresses per nvdimm.

   ADR (Asynchronous DRAM Refresh) flushes data in posted write buffers
   to the memory controller on a power-fail event.

   Flush addresses are defined in ACPI 6.x as an NVDIMM Firmware
   Interface Table (NFIT) sub-structure: "Flush Hint Address Structure".
   A flush hint is an mmio address that when written and fenced assures
   that all previous posted writes targeting a given dimm have been
   flushed to media.

 - On-demand ARS (address range scrub).

   Linux uses the results of the ACPI ARS commands to track bad blocks
   in pmem devices.  When latent errors are detected we re-scrub the
   media to refresh the bad block list, userspace can also request a
   re-scrub at any time.

 - Support for the Microsoft DSM (device specific method) command
   format.

 - Support for EDK2/OVMF virtual disk device memory ranges.

 - Various fixes and cleanups across the subsystem.

* tag 'libnvdimm-for-4.8' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (41 commits)
  libnvdimm-btt: Delete an unnecessary check before the function call "__nd_device_register"
  nfit: do an ARS scrub on hitting a latent media error
  nfit: move to nfit/ sub-directory
  nfit, libnvdimm: allow an ARS scrub to be triggered on demand
  libnvdimm: register nvdimm_bus devices with an nd_bus driver
  pmem: clarify a debug print in pmem_clear_poison
  x86/insn: remove pcommit
  Revert "KVM: x86: add pcommit support"
  nfit, tools/testing/nvdimm/: unify shutdown paths
  libnvdimm: move ->module to struct nvdimm_bus_descriptor
  nfit: cleanup acpi_nfit_init calling convention
  nfit: fix _FIT evaluation memory leak + use after free
  tools/testing/nvdimm: add manufacturing_{date|location} dimm properties
  tools/testing/nvdimm: add virtual ramdisk range
  acpi, nfit: treat virtual ramdisk SPA as pmem region
  pmem: kill __pmem address space
  pmem: kill wmb_pmem()
  libnvdimm, pmem: use nvdimm_flush() for namespace I/O writes
  fs/dax: remove wmb_pmem()
  libnvdimm, pmem: flush posted-write queues on shutdown
  ...
parents d94ba9e7 0606263f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -395,7 +395,7 @@ prototypes:
	int (*release) (struct gendisk *, fmode_t);
	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
	int (*direct_access) (struct block_device *, sector_t, void __pmem **,
	int (*direct_access) (struct block_device *, sector_t, void **,
				unsigned long *);
	int (*media_changed) (struct gendisk *);
	void (*unlock_native_capacity) (struct gendisk *);
+9 −19
Original line number Diff line number Diff line
@@ -256,28 +256,18 @@ If any of these error conditions are encountered, the arena is put into a read
only state using a flag in the info block.


5. In-kernel usage
==================
5. Usage
========

Any block driver that supports byte granularity IO to the storage may register
with the BTT. It will have to provide the rw_bytes interface in its
block_device_operations struct:
The BTT can be set up on any disk (namespace) exposed by the libnvdimm subsystem
(pmem, or blk mode). The easiest way to set up such a namespace is using the
'ndctl' utility [1]:

	int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw);
For example, the ndctl command line to setup a btt with a 4k sector size is:

It may register with the BTT after it adds its own gendisk, using btt_init:
    ndctl create-namespace -f -e namespace0.0 -m sector -l 4k

	struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize,
			u32 lbasize, u8 uuid[], int maxlane);
See ndctl create-namespace --help for more options.

note that maxlane is the maximum amount of concurrency the driver wishes to
allow the BTT to use.

The BTT 'disk' appears as a stacked block device that grabs the underlying block
device in the O_EXCL mode.

When the driver wishes to remove the backing disk, it should similarly call
btt_fini using the same struct btt* handle that was provided to it by btt_init.

	void btt_fini(struct btt *btt);
[1]: https://github.com/pmem/ndctl
+2 −2
Original line number Diff line number Diff line
@@ -143,12 +143,12 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
 */
static long
axon_ram_direct_access(struct block_device *device, sector_t sector,
		       void __pmem **kaddr, pfn_t *pfn, long size)
		       void **kaddr, pfn_t *pfn, long size)
{
	struct axon_ram_bank *bank = device->bd_disk->private_data;
	loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;

	*kaddr = (void __pmem __force *) bank->io_addr + offset;
	*kaddr = (void *) bank->io_addr + offset;
	*pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
	return bank->size - offset;
}
+0 −1
Original line number Diff line number Diff line
@@ -225,7 +225,6 @@
#define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX		( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
#define X86_FEATURE_PCOMMIT	( 9*32+22) /* PCOMMIT instruction */
#define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
#define X86_FEATURE_CLWB	( 9*32+24) /* CLWB instruction */
#define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
+19 −58
Original line number Diff line number Diff line
@@ -26,13 +26,11 @@
 * @n: length of the copy in bytes
 *
 * Copy data to persistent memory media via non-temporal stores so that
 * a subsequent arch_wmb_pmem() can flush cpu and memory controller
 * write buffers to guarantee durability.
 * a subsequent pmem driver flush operation will drain posted write queues.
 */
static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
		size_t n)
static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
{
	int unwritten;
	int rem;

	/*
	 * We are copying between two kernel buffers, if
@@ -40,59 +38,36 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
	 * fault) we would have already reported a general protection fault
	 * before the WARN+BUG.
	 */
	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
			(void __user *) src, n);
	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
				__func__, dst, src, unwritten))
	rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n);
	if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n",
				__func__, dst, src, rem))
		BUG();
}

static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src,
		size_t n)
static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
{
	if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
		return memcpy_mcsafe(dst, (void __force *) src, n);
	memcpy(dst, (void __force *) src, n);
		return memcpy_mcsafe(dst, src, n);
	memcpy(dst, src, n);
	return 0;
}

/**
 * arch_wmb_pmem - synchronize writes to persistent memory
 *
 * After a series of arch_memcpy_to_pmem() operations this drains data
 * from cpu write buffers and any platform (memory controller) buffers
 * to ensure that written data is durable on persistent memory media.
 */
static inline void arch_wmb_pmem(void)
{
	/*
	 * wmb() to 'sfence' all previous writes such that they are
	 * architecturally visible to 'pcommit'.  Note, that we've
	 * already arranged for pmem writes to avoid the cache via
	 * arch_memcpy_to_pmem().
	 */
	wmb();
	pcommit_sfence();
}

/**
 * arch_wb_cache_pmem - write back a cache range with CLWB
 * @vaddr:	virtual start address
 * @size:	number of bytes to write back
 *
 * Write back a cache range using the CLWB (cache line write back)
 * instruction.  This function requires explicit ordering with an
 * arch_wmb_pmem() call.
 * instruction.
 */
static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
static inline void arch_wb_cache_pmem(void *addr, size_t size)
{
	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
	unsigned long clflush_mask = x86_clflush_size - 1;
	void *vaddr = (void __force *)addr;
	void *vend = vaddr + size;
	void *vend = addr + size;
	void *p;

	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
	for (p = (void *)((unsigned long)addr & ~clflush_mask);
	     p < vend; p += x86_clflush_size)
		clwb(p);
}
@@ -113,16 +88,14 @@ static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
 * @i:		iterator with source data
 *
 * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
 * This function requires explicit ordering with an arch_wmb_pmem() call.
 */
static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
		struct iov_iter *i)
{
	void *vaddr = (void __force *)addr;
	size_t len;

	/* TODO: skip the write-back by always using non-temporal stores */
	len = copy_from_iter_nocache(vaddr, bytes, i);
	len = copy_from_iter_nocache(addr, bytes, i);

	if (__iter_needs_pmem_wb(i))
		arch_wb_cache_pmem(addr, bytes);
@@ -136,28 +109,16 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
 * @size:	number of bytes to zero
 *
 * Write zeros into the memory range starting at 'addr' for 'size' bytes.
 * This function requires explicit ordering with an arch_wmb_pmem() call.
 */
static inline void arch_clear_pmem(void __pmem *addr, size_t size)
static inline void arch_clear_pmem(void *addr, size_t size)
{
	void *vaddr = (void __force *)addr;

	memset(vaddr, 0, size);
	memset(addr, 0, size);
	arch_wb_cache_pmem(addr, size);
}

static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
static inline void arch_invalidate_pmem(void *addr, size_t size)
{
	clflush_cache_range((void __force *) addr, size);
}

static inline bool __arch_has_wmb_pmem(void)
{
	/*
	 * We require that wmb() be an 'sfence', that is only guaranteed on
	 * 64-bit builds
	 */
	return static_cpu_has(X86_FEATURE_PCOMMIT);
	clflush_cache_range(addr, size);
}
#endif /* CONFIG_ARCH_HAS_PMEM_API */
#endif /* __ASM_X86_PMEM_H__ */
Loading