Diffstat (limited to 'mm/readahead.c')
-rw-r--r--  mm/readahead.c  230
1 file changed, 215 insertions(+), 15 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index c3c4c30fc121..d3a47546d17d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,6 +8,111 @@
* Initial version.
*/
+/**
+ * DOC: Readahead Overview
+ *
+ * Readahead is used to read content into the page cache before it is
+ * explicitly requested by the application. Readahead only ever
+ * attempts to read pages that are not yet in the page cache. If a
+ * page is present but not up-to-date, readahead will not try to read
+ * it. In that case a simple ->readpage() will be requested.
+ *
+ * Readahead is triggered when an application read request (whether a
+ * system call or a page fault) finds that the requested page is not in
+ * the page cache, or that it is in the page cache and has the
+ * %PG_readahead flag set. This flag indicates that the page was loaded
+ * as part of a previous read-ahead request and now that it has been
+ * accessed, it is time for the next read-ahead.
+ *
+ * Each readahead request is partly a synchronous read and partly an
+ * asynchronous read-ahead. This is reflected in the struct file_ra_state,
+ * in which ->size is the total number of pages and ->async_size is the
+ * number of pages in the async section. The first page in this async
+ * section will have %PG_readahead set as a trigger for a subsequent
+ * readahead. Once a series of sequential reads has been established,
+ * there should be no need for a synchronous component and all readahead
+ * requests will be fully asynchronous.
+ *
+ * When either of the triggers causes a readahead, three numbers need to
+ * be determined: the start of the region, the size of the region, and
+ * the size of the async tail.
+ *
+ * The start of the region is simply the first page at or after the
+ * accessed index which is not currently present in the page cache.
+ * This is found with a simple search in the page cache.
+ *
+ * The size of the async tail is determined by subtracting the size that
+ * was explicitly requested from the determined request size, unless
+ * this would be less than zero, in which case zero is used. NOTE THIS
+ * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
+ * PAGE.
+ *
+ * The size of the region is normally determined from the size of the
+ * previous readahead which loaded the preceding pages. This may be
+ * discovered from the struct file_ra_state for simple sequential reads,
+ * or from examining the state of the page cache when multiple
+ * sequential reads are interleaved. Specifically: where the readahead
+ * was triggered by the %PG_readahead flag, the size of the previous
+ * readahead is assumed to be the number of pages from the triggering
+ * page to the start of the new readahead. In these cases, the size of
+ * the previous readahead is scaled, often doubled, for the new
+ * readahead, though see get_next_ra_size() for details.
+ *
+ * If the size of the previous read cannot be determined, the number of
+ * preceding pages in the page cache is used to estimate the size of
+ * a previous read. This estimate could easily be misled by random
+ * reads being coincidentally adjacent, so it is ignored unless it is
+ * larger than the current request, and it is not scaled up, unless it
+ * is at the start of the file.
+ *
+ * In general, readahead is accelerated at the start of the file, as
+ * reads from there are often sequential. There are other minor
+ * adjustments to the readahead size in various special cases and these
+ * are best discovered by reading the code.
+ *
+ * The above calculation determines the readahead size, to which any requested
+ * read size may be added.
+ *
+ * Readahead requests are sent to the filesystem using the ->readahead()
+ * address space operation, for which mpage_readahead() is a canonical
+ * implementation. ->readahead() should normally initiate reads on all
+ * pages, but may fail to read any or all pages without causing an IO
+ * error. The page cache reading code will issue a ->readpage() request
+ * for any page which ->readahead() does not provide, and only an error
+ * from this will be final.
+ *
+ * ->readahead() will generally call readahead_page() repeatedly to get
+ * each page from those prepared for readahead. It may fail to read a
+ * page by:
+ *
+ * * not calling readahead_page() sufficiently many times, effectively
+ * ignoring some pages, as might be appropriate if the path to
+ * storage is congested.
+ *
+ * * failing to actually submit a read request for a given page,
+ * possibly due to insufficient resources, or
+ *
+ * * getting an error during subsequent processing of a request.
+ *
+ * In the last two cases, the page should be unlocked to indicate that
+ * the read attempt has failed. In the first case the page will be
+ * unlocked by the caller.
+ *
+ * Those pages not in the final ``async_size`` of the request should be
+ * considered to be important and ->readahead() should not fail them due
+ * to congestion or temporary resource unavailability, but should wait
+ * for necessary resources (e.g. memory or indexing information) to
+ * become available. Pages in the final ``async_size`` may be
+ * considered less urgent and failure to read them is more acceptable.
+ * In this case it is best to use delete_from_page_cache() to remove the
+ * pages from the page cache as is automatically done for pages that
+ * were not fetched with readahead_page(). This will allow a
+ * subsequent synchronous readahead request to try them again. If they
+ * are left in the page cache, then they will be read individually using
+ * ->readpage().
+ *
+ */
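
To make the ->size / ->async_size split concrete, here is a small standalone sketch with illustrative numbers; the variable names mirror the struct file_ra_state fields described above, but nothing in it is kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long start = 100;	/* first page index of the readahead region */
	unsigned long size = 32;	/* total pages in this readahead request */
	unsigned long async_size = 16;	/* pages forming the async tail */

	/* The first page of the async section is marked with PG_readahead. */
	unsigned long mark = start + size - async_size;

	printf("pages %lu-%lu are read ahead\n", start, start + size - 1);
	printf("PG_readahead is set on page %lu; accessing it triggers the next readahead\n",
	       mark);
	return 0;
}

The same arithmetic produces the mark index in page_cache_ra_order() further down in this patch.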
+
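
As described above, ->readahead() pulls pages with readahead_page() and may legitimately leave some of them unread. A minimal sketch of such an implementation follows; myfs_submit_read_async() is a hypothetical helper, assumed here to start asynchronous I/O on one locked page and to unlock it from its completion handler.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Hypothetical helper, assumed for this sketch: starts asynchronous I/O
 * for one locked page; the completion handler unlocks the page.
 */
int myfs_submit_read_async(struct page *page);

static void myfs_readahead(struct readahead_control *rac)
{
	struct page *page;

	while ((page = readahead_page(rac))) {
		if (myfs_submit_read_async(page) < 0) {
			/*
			 * Submission failed: unlock so the page can be
			 * retried later via ->readpage().
			 */
			unlock_page(page);
		}
		put_page(page);
	}
	/*
	 * Pages never pulled with readahead_page() are unlocked and, for
	 * the async tail, dropped from the page cache by read_pages().
	 */
}

mpage_readahead() follows the same loop, building bios as it goes.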
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
@@ -127,8 +232,17 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
if (aops->readahead) {
aops->readahead(rac);
- /* Clean up the remaining pages */
+ /*
+ * Clean up the remaining pages. The sizes in ->ra
+ * may be used to size the next readahead, so make sure
+ * they accurately reflect what happened.
+ */
while ((page = readahead_page(rac))) {
+ rac->ra->size -= 1;
+ if (rac->ra->async_size > 0) {
+ rac->ra->async_size -= 1;
+ delete_from_page_cache(page);
+ }
unlock_page(page);
put_page(page);
}
@@ -148,7 +262,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
blk_finish_plug(&plug);
- BUG_ON(!list_empty(pages));
+ BUG_ON(pages && !list_empty(pages));
BUG_ON(readahead_count(rac));
out:
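
As a concrete illustration of the cleanup accounting above: if the filesystem consumed all but three of the pages, both counters shrink by three, so the next readahead is sized from what was actually read. A standalone sketch with illustrative numbers:

#include <stdio.h>

int main(void)
{
	unsigned long size = 32, async_size = 16;
	unsigned long leftover = 3;	/* pages ->readahead() never consumed */

	while (leftover--) {
		size -= 1;
		if (async_size > 0) {
			async_size -= 1;
			/* the kernel also removes this page from the page cache */
		}
	}
	printf("ra->size=%lu ra->async_size=%lu\n", size, async_size);
	return 0;
}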
@@ -247,7 +361,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
-void do_page_cache_ra(struct readahead_control *ractl,
+static void do_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct inode *inode = ractl->mapping->host;
@@ -432,10 +546,102 @@ static int try_context_readahead(struct address_space *mapping,
}
/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size. I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages).
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER 8
+#endif
+
+static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
+ pgoff_t mark, unsigned int order, gfp_t gfp)
+{
+ int err;
+ struct folio *folio = filemap_alloc_folio(gfp, order);
+
+ if (!folio)
+ return -ENOMEM;
+ if (mark - index < (1UL << order))
+ folio_set_readahead(folio);
+ err = filemap_add_folio(ractl->mapping, folio, index, gfp);
+ if (err)
+ folio_put(folio);
+ else
+ ractl->_nr_pages += 1UL << order;
+ return err;
+}
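
The "mark - index < (1UL << order)" test above relies on unsigned wrap-around: it is true only when the PG_readahead marker falls somewhere inside the folio being allocated. A standalone sketch with illustrative numbers:

#include <stdio.h>

int main(void)
{
	unsigned long mark = 36;	/* index + ra->size - ra->async_size */
	unsigned long index = 32;	/* first index covered by this folio */
	unsigned int order = 4;		/* a 16-page folio: indices 32..47 */

	if (mark - index < (1UL << order))
		printf("folio %lu-%lu carries PG_readahead (mark %lu is inside it)\n",
		       index, index + (1UL << order) - 1, mark);
	return 0;
}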
+
+void page_cache_ra_order(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned int new_order)
+{
+ struct address_space *mapping = ractl->mapping;
+ pgoff_t index = readahead_index(ractl);
+ pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+ pgoff_t mark = index + ra->size - ra->async_size;
+ int err = 0;
+ gfp_t gfp = readahead_gfp_mask(mapping);
+
+ if (!mapping_large_folio_support(mapping) || ra->size < 4)
+ goto fallback;
+
+ limit = min(limit, index + ra->size - 1);
+
+ if (new_order < MAX_PAGECACHE_ORDER) {
+ new_order += 2;
+ if (new_order > MAX_PAGECACHE_ORDER)
+ new_order = MAX_PAGECACHE_ORDER;
+ while ((1 << new_order) > ra->size)
+ new_order--;
+ }
+
+ while (index <= limit) {
+ unsigned int order = new_order;
+
+ /* Align with smaller pages if needed */
+ if (index & ((1UL << order) - 1)) {
+ order = __ffs(index);
+ if (order == 1)
+ order = 0;
+ }
+ /* Don't allocate pages past EOF */
+ while (index + (1UL << order) - 1 > limit) {
+ if (--order == 1)
+ order = 0;
+ }
+ err = ra_alloc_folio(ractl, index, mark, order, gfp);
+ if (err)
+ break;
+ index += 1UL << order;
+ }
+
+ if (index > limit) {
+ ra->size += index - limit - 1;
+ ra->async_size += index - limit - 1;
+ }
+
+ read_pages(ractl, NULL, false);
+
+ /*
+ * If there were already pages in the page cache, then we may have
+ * left some gaps. Let the regular readahead code take care of this
+ * situation.
+ */
+ if (!err)
+ return;
+fallback:
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
+}
+
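
The order selection in page_cache_ra_order() can be traced with a small standalone program; the values below are illustrative and the kernel's __ffs() is stood in for by __builtin_ctzl():

#include <stdio.h>

/* Stand-in for the kernel's __ffs() on a non-zero value. */
static unsigned int ffs_index(unsigned long x)
{
	return __builtin_ctzl(x);
}

int main(void)
{
	unsigned long index = 36, limit = 100, ra_size = 32;
	unsigned int new_order = 4;	/* e.g. previous folio order + 2, clamped */

	while ((1UL << new_order) > ra_size)
		new_order--;

	while (index <= limit) {
		unsigned int order = new_order;

		/* Align with smaller folios until index is order-aligned. */
		if (index & ((1UL << order) - 1)) {
			order = ffs_index(index);
			if (order == 1)
				order = 0;
		}
		/* Do not allocate past the EOF limit. */
		while (index + (1UL << order) - 1 > limit) {
			if (--order == 1)
				order = 0;
		}
		printf("folio at %lu, order %u (%lu pages)\n",
		       index, order, 1UL << order);
		index += 1UL << order;
	}
	return 0;
}

For index 36 and limit 100 this produces folios of order 2, 3, 4, 4, 4, 2 and 0, showing how the loop ramps up to the requested order once aligned, then shrinks again to avoid crossing the EOF limit (order-1 folios are skipped, matching the checks in the code above).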
+/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
static void ondemand_readahead(struct readahead_control *ractl,
- bool hit_readahead_marker, unsigned long req_size)
+ struct folio *folio, unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
struct file_ra_state *ra = ractl->ra;
@@ -470,12 +676,12 @@ static void ondemand_readahead(struct readahead_control *ractl,
}
/*
- * Hit a marked page without valid readahead state.
+ * Hit a marked folio without valid readahead state.
* E.g. interleaved reads.
* Query the pagecache for async_size, which normally equals to
* readahead size. Ramp it up and use it as the new readahead size.
*/
- if (hit_readahead_marker) {
+ if (folio) {
pgoff_t start;
rcu_read_lock();
@@ -548,7 +754,7 @@ readit:
}
ractl->_index = ra->start;
- do_page_cache_ra(ractl, ra->size, ra->async_size);
+ page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
}
void page_cache_sync_ra(struct readahead_control *ractl,
@@ -576,7 +782,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
}
/* do read-ahead */
- ondemand_readahead(ractl, false, req_count);
+ ondemand_readahead(ractl, NULL, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
@@ -595,17 +801,11 @@ void page_cache_async_ra(struct readahead_control *ractl,
folio_clear_readahead(folio);
- /*
- * Defer asynchronous read-ahead on IO congestion.
- */
- if (inode_read_congested(ractl->mapping->host))
- return;
-
if (blk_cgroup_congested())
return;
/* do read-ahead */
- ondemand_readahead(ractl, true, req_count);
+ ondemand_readahead(ractl, folio, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
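
For completeness, the two entry points above are driven from the generic read path roughly as sketched here; this is a simplified, illustrative fragment rather than the actual filemap code, and nr stands for the remaining request length in pages:

#include <linux/pagemap.h>

static void demo_trigger_readahead(struct readahead_control *ractl,
				   struct folio *folio, unsigned long nr)
{
	if (!folio) {
		/* Cache miss: start a synchronous readahead at the miss. */
		page_cache_sync_ra(ractl, nr);
	} else if (folio_test_readahead(folio)) {
		/* Hit the PG_readahead marker: kick off the next async batch. */
		page_cache_async_ra(ractl, folio, nr);
	}
}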