diff --git a/dm-sflc/src/lite/flush.c b/dm-sflc/src/lite/flush.c
index dcb39b6..cac0922 100644
--- a/dm-sflc/src/lite/flush.c
+++ b/dm-sflc/src/lite/flush.c
@@ -27,7 +27,7 @@
 #include 
 
-static int __serialise_and_encrypt_posmap_blocks(struct sflite_volume *svol, u32 i, u32 j, gfp_t gfp_flags)
+static int __serialise_and_encrypt_posmap_blocks(struct sflite_volume *svol, u32 i, u32 j, gfp_t gfp)
 {
 	u32 first_lsi = i * SFLITE_PSIS_PER_BLOCK;
 	u32 last_lsi = min(j * SFLITE_PSIS_PER_BLOCK, svol->sdev->tot_slices);
@@ -47,7 +47,7 @@ static int __serialise_and_encrypt_posmap_blocks(struct sflite_volume *svol, u32
 		start_ptr, start_ptr, j - i,
 		(SFLITE_POSMAP_START_SECTOR(svol) >> SFLITE_BLOCK_SHIFT) + i,
-		WRITE, gfp_flags);
+		WRITE, gfp);
 }
 
 /**
@@ -115,7 +115,7 @@ static void cwb_notify_io(unsigned long error, void *context)
  * Send all CWBs prepared before (marked by flush_pending), and
  * synchronously wait for them. No lock needs to be held.
  */
-static int sflite_send_posmap_cwbs(struct sflite_volume *svol, gfp_t gfp)
+static int send_posmap_cwbs(struct sflite_volume *svol, gfp_t gfp)
 {
 	u32 i, j, nblocks;
 	struct dm_io_region region;
@@ -187,9 +187,8 @@ static void flush_remap_endio(struct bio *clone)
 /**
  * Issue a FLUSH to the underlying device, either brand-new or cloning
  * an orig_bio coming from the upper layer.
- * The status code of the cloned bio is copied onto the original one.
  */
-int issue_lowlevel_flush(struct sflite_volume *svol, struct bio *orig_bio, gfp_t gfp)
+static int issue_lowlevel_flush(struct sflite_volume *svol, struct bio *orig_bio, gfp_t gfp)
 {
 	struct bio *clone;
 	DECLARE_COMPLETION_ONSTACK(compl);
+	blk_status_t status;
@@ -206,10 +205,9 @@ int issue_lowlevel_flush(struct sflite_volume *svol, struct bio *orig_bio, gfp_t
 	clone->bi_private = &compl;
 	clone->bi_end_io = flush_remap_endio;
 
-	// Submit and wait, then copy status code
+	// Submit and wait
 	dm_submit_bio_remap(orig_bio, clone);
 	wait_for_completion_io(&compl);
-	orig_bio->bi_status = clone->bi_status;
+	// Capture the clone's status before dropping our reference to it
+	status = clone->bi_status;
 	bio_put(clone);
 
-	return blk_status_to_errno(orig_bio->bi_status);
+	return blk_status_to_errno(status);
@@ -221,7 +219,7 @@ int issue_lowlevel_flush(struct sflite_volume *svol, struct bio *orig_bio, gfp_t
  * did not fail, if their sequence number hasn't changed in the meantime (we
  * detect cache re-dirtying).
  */
-int mark_posmap_blocks_clean(struct sflite_volume *svol)
+static int mark_posmap_blocks_clean(struct sflite_volume *svol)
 {
 	u32 block;
 	int err = 0; // Set to non-zero if at least one CWB failed
@@ -239,6 +237,90 @@
 }
 
+/**
+ * Flush dirty posmap blocks onto the disk.
+ * Puts together the previous building blocks, with minimal locking for
+ * mutual exclusion with WRITEs and DISCARDs in the critical sections.
+ * There is a fundamental assumption that this function is never executed
+ * concurrently: it assumes that it has exclusive lockless access to
+ * `crypt_entries` while the CWBs are in flight.
+ */
+int sflite_flush_posmap(struct sflite_volume *svol, struct bio *orig_bio, gfp_t gfp)
+{
+	int err;
+
+	/* To prepare the CWBs (encrypt all the `dirty` blocks from `entries`
+	 * onto `crypt_entries`, mark `flush_pending`, and snapshot the `seqnum`
+	 * onto `snap_seqnum`), we need to lock out the WRITEs and the DISCARDs:
+	 * they might write to the `dirty` and `seqnum` fields we need to read,
+	 * and they might read from the `flush_pending` and `snap_seqnum` fields
+	 * we need to write.
+	 */
+	err = down_write_killable(&svol->posmap.flush_lock);
+	if (err)
+		goto out;
+	err = prepare_posmap_cwbs(svol, gfp);
+	if (err) {
+		// Clean up flush_pending: give up the FLUSH operation altogether
+		DMWARN("FLUSH: could not prepare_cwbs(): error %d", err);
+		bitmap_clear(svol->posmap.flush_pending, 0, svol->sdev->posmap_size_blocks);
+	}
+	up_write(&svol->posmap.flush_lock);
+	if (err)
+		goto out;
+
+	/* No lock needed to send the CWBs we prepared before: we assume
+	 * non-reentrancy, i.e. no two FLUSHes will ever execute concurrently.
+	 * This guarantees that no-one will touch `crypt_entries` while
+	 * the CWBs are in flight.
+	 * We also *read* from `flush_pending`, but that is fine because all
+	 * the other operations (WRITEs and DISCARDs) also just read it: the
+	 * only writer is this function, which cannot be reentered.
+	 * Here we sleep, waiting for I/O, but since we don't hold locks, the
+	 * WRITEs can proceed in the meantime (we will detect cache re-dirtying
+	 * later). */
+	err = send_posmap_cwbs(svol, gfp);
+	if (err) {
+		// Clean up and abort the FLUSH operation.
+		DMWARN("FLUSH: could not send_cwbs(): error %d", err);
+		// All CWB callbacks have finished now, no locking needed
+		bitmap_clear(svol->posmap.cwb_error, 0, svol->sdev->posmap_size_blocks);
+		// Need to clear this one under the rwsem
+		down_write(&svol->posmap.flush_lock); // Not killable, for simplicity
+		bitmap_clear(svol->posmap.flush_pending, 0, svol->sdev->posmap_size_blocks);
+		up_write(&svol->posmap.flush_lock);
+		goto out;
+	}
+
+	/* Whether it's brand-new or we remap one coming from the upper layer,
+	 * here we issue a FLUSH to the underlying device. We had to wait until
+	 * all CWBs returned, so that the FLUSH guarantees apply to them.
+	 * Again no lock needed, so the WRITEs can proceed while we wait for I/O. */
+	err = issue_lowlevel_flush(svol, orig_bio, gfp);
+	if (err) {
+		// Bummer, we got this far and then the FLUSH failed. Clean up and abort.
+		DMWARN("FLUSH: could not issue_lowlevel_flush(): error %d", err);
+		// All CWB callbacks have finished now, no locking needed
+		bitmap_clear(svol->posmap.cwb_error, 0, svol->sdev->posmap_size_blocks);
+		// Need to clear this one under the rwsem
+		down_write(&svol->posmap.flush_lock); // Not killable, for simplicity
+		bitmap_clear(svol->posmap.flush_pending, 0, svol->sdev->posmap_size_blocks);
+		up_write(&svol->posmap.flush_lock);
+		goto out;
+	}
+
+	/* The low-level FLUSH succeeded, therefore we can mark clean *some*
+	 * of the posmap blocks: those whose CWB did not fail, and whose seqnum
+	 * did not change while we were not holding the locks.
+	 * If some CWBs failed, we return an error (then failing the upper-layer
+	 * bio in the caller, if there is one), but that doesn't mean aborting
+	 * entirely, because we can still mark some blocks as clean.
+	 */
+	err = mark_posmap_blocks_clean(svol);
+
+out:
+	if (orig_bio) {
+		orig_bio->bi_status = errno_to_blk_status(err);
+		bio_endio(orig_bio);
+	}
+	return err;
+}
 
 /* Landing here from ->map() always through the flush_queue */
diff --git a/dm-sflc/src/lite/sflc_lite.h b/dm-sflc/src/lite/sflc_lite.h
index 6f7ff23..c42606e 100644
--- a/dm-sflc/src/lite/sflc_lite.h
+++ b/dm-sflc/src/lite/sflc_lite.h
@@ -67,10 +67,10 @@ void sflite_flush_work_fn(struct work_struct *work);
 void sflite_discard_work_fn(struct work_struct *work);
 
 /* Position map */
-int sflite_load_and_sanitise_posmap(struct sflite_volume *svol, gfp_t gfp_flags);
+int sflite_load_and_sanitise_posmap(struct sflite_volume *svol, gfp_t gfp);
 int sflite_create_local_slice_mapping(struct sflite_volume *svol, u32 lsi, u32 *psi);
 int sflite_destroy_local_slice_mapping(struct sflite_volume *svol, u32 lsi);
-int sflite_flush_posmap(struct sflite_volume *svol, gfp_t gfp_flags);
+int sflite_flush_posmap(struct sflite_volume *svol, struct bio *orig_bio, gfp_t gfp);
 
 /* Crypto */
 int sflite_crypt_blocks_buf(struct crypto_skcipher *tfm, void *src_buf, void *dst_buf,
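
Note on usage: sflite_flush_posmap() relies on never being re-entered, which fits the existing "always through the flush_queue" dispatch. Below is a minimal sketch, not part of the patch, of how an ordered-workqueue handler could drive the new signature. Only sflite_flush_posmap(), sflite_flush_work_fn() (declared in sflc_lite.h) and standard kernel APIs are real; the handler name example_flush_work_fn and the svol fields flush_work, flush_bios and flush_bios_lock are assumptions made purely for illustration.

#include <linux/bio.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Hypothetical stand-in for sflite_flush_work_fn(): drain the FLUSH bios
 * that ->map() deferred, one at a time. Running on an ordered
 * (single-threaded) workqueue is what provides the non-reentrancy that
 * sflite_flush_posmap() assumes. */
static void example_flush_work_fn(struct work_struct *work)
{
	struct sflite_volume *svol =
		container_of(work, struct sflite_volume, flush_work);
	struct bio *bio;
	unsigned long flags;

	for (;;) {
		/* Pop the next deferred FLUSH bio, if any (assumed fields). */
		spin_lock_irqsave(&svol->flush_bios_lock, flags);
		bio = bio_list_pop(&svol->flush_bios);
		spin_unlock_irqrestore(&svol->flush_bios_lock, flags);
		if (!bio)
			break;

		/* sflite_flush_posmap() ends the bio itself with the right status,
		 * so the worker only has to hand it over. */
		sflite_flush_posmap(svol, bio, GFP_NOIO);
	}
}

With this shape, a failure anywhere in the flush path is reported through the bio's status by sflite_flush_posmap() itself, and the worker needs no error handling of its own.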