diff options
Diffstat (limited to 'mm/readahead.c')
-rw-r--r-- | mm/readahead.c | 230 |
1 files changed, 215 insertions, 15 deletions
diff --git a/mm/readahead.c b/mm/readahead.c index c3c4c30fc121..d3a47546d17d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,6 +8,111 @@ * Initial version. */ +/** + * DOC: Readahead Overview + * + * Readahead is used to read content into the page cache before it is + * explicitly requested by the application. Readahead only ever + * attempts to read pages that are not yet in the page cache. If a + * page is present but not up-to-date, readahead will not try to read + * it. In that case a simple ->readpage() will be requested. + * + * Readahead is triggered when an application read request (whether a + * system call or a page fault) finds that the requested page is not in + * the page cache, or that it is in the page cache and has the + * %PG_readahead flag set. This flag indicates that the page was loaded + * as part of a previous read-ahead request and now that it has been + * accessed, it is time for the next read-ahead. + * + * Each readahead request is partly synchronous read, and partly async + * read-ahead. This is reflected in the struct file_ra_state which + * contains ->size being the total number of pages, and ->async_size + * which is the number of pages in the async section. The first page in + * this async section will have %PG_readahead set as a trigger for a + * subsequent read ahead. Once a series of sequential reads has been + * established, there should be no need for a synchronous component and + * all read ahead requests will be fully asynchronous. + * + * When either of the triggers causes a readahead, three numbers need to + * be determined: the start of the region, the size of the region, and + * the size of the async tail. + * + * The start of the region is simply the first page address at or after + * the accessed address, which is not currently populated in the page + * cache. This is found with a simple search in the page cache. 
+ * + * The size of the async tail is determined by subtracting the size that + * was explicitly requested from the determined request size, unless + * this would be less than zero - then zero is used. NOTE THIS + * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED + * PAGE. + * + * The size of the region is normally determined from the size of the + * previous readahead which loaded the preceding pages. This may be + * discovered from the struct file_ra_state for simple sequential reads, + * or from examining the state of the page cache when multiple + * sequential reads are interleaved. Specifically: where the readahead + * was triggered by the %PG_readahead flag, the size of the previous + * readahead is assumed to be the number of pages from the triggering + * page to the start of the new readahead. In these cases, the size of + * the previous readahead is scaled, often doubled, for the new + * readahead, though see get_next_ra_size() for details. + * + * If the size of the previous read cannot be determined, the number of + * preceding pages in the page cache is used to estimate the size of + * a previous read. This estimate could easily be misled by random + * reads being coincidentally adjacent, so it is ignored unless it is + * larger than the current request, and it is not scaled up, unless it + * is at the start of file. + * + * In general read ahead is accelerated at the start of the file, as + * reads from there are often sequential. There are other minor + * adjustments to the read ahead size in various special cases and these + * are best discovered by reading the code. + * + * The above calculation determines the readahead, to which any requested + * read size may be added. + * + * Readahead requests are sent to the filesystem using the ->readahead() + * address space operation, for which mpage_readahead() is a canonical + * implementation. 
->readahead() should normally initiate reads on all + * pages, but may fail to read any or all pages without causing an IO + * error. The page cache reading code will issue a ->readpage() request + * for any page which ->readahead() did not read, and only an error + * from this will be final. + * + * ->readahead() will generally call readahead_page() repeatedly to get + * each page from those prepared for read ahead. It may fail to read a + * page by: + * + * * not calling readahead_page() sufficiently many times, effectively + * ignoring some pages, as might be appropriate if the path to + * storage is congested. + * + * * failing to actually submit a read request for a given page, + * possibly due to insufficient resources, or + * + * * getting an error during subsequent processing of a request. + * + * In the last two cases, the page should be unlocked to indicate that + * the read attempt has failed. In the first case the page will be + * unlocked by the caller. + * + * Those pages not in the final ``async_size`` of the request should be + * considered to be important and ->readahead() should not fail them due + * to congestion or temporary resource unavailability, but should wait + * for necessary resources (e.g. memory or indexing information) to + * become available. Pages in the final ``async_size`` may be + * considered less urgent and failure to read them is more acceptable. + * In this case it is best to use delete_from_page_cache() to remove the + * pages from the page cache as is automatically done for pages that + * were not fetched with readahead_page(). This will allow a + * subsequent synchronous read ahead request to try them again. If they + * are left in the page cache, then they will be read individually using + * ->readpage(). 
+ * + */ + #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> @@ -127,8 +232,17 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, if (aops->readahead) { aops->readahead(rac); - /* Clean up the remaining pages */ + /* + * Clean up the remaining pages. The sizes in ->ra + * may be used to size the next read-ahead, so make sure + * they accurately reflect what happened. + */ while ((page = readahead_page(rac))) { + rac->ra->size -= 1; + if (rac->ra->async_size > 0) { + rac->ra->async_size -= 1; + delete_from_page_cache(page); + } unlock_page(page); put_page(page); } @@ -148,7 +262,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, blk_finish_plug(&plug); - BUG_ON(!list_empty(pages)); + BUG_ON(pages && !list_empty(pages)); BUG_ON(readahead_count(rac)); out: @@ -247,7 +361,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ -void do_page_cache_ra(struct readahead_control *ractl, +static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; @@ -432,10 +546,102 @@ static int try_context_readahead(struct address_space *mapping, } /* + * There are some parts of the kernel which assume that PMD entries + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, + * limit the maximum allocation order to PMD size. 
I'm not aware of any + * assumptions about maximum order if THP are disabled, but 8 seems like + * a good order (that's 1MB if you're using 4kB pages) + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#else +#define MAX_PAGECACHE_ORDER 8 +#endif + +static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, + pgoff_t mark, unsigned int order, gfp_t gfp) +{ + int err; + struct folio *folio = filemap_alloc_folio(gfp, order); + + if (!folio) + return -ENOMEM; + if (mark - index < (1UL << order)) + folio_set_readahead(folio); + err = filemap_add_folio(ractl->mapping, folio, index, gfp); + if (err) + folio_put(folio); + else + ractl->_nr_pages += 1UL << order; + return err; +} + +void page_cache_ra_order(struct readahead_control *ractl, + struct file_ra_state *ra, unsigned int new_order) +{ + struct address_space *mapping = ractl->mapping; + pgoff_t index = readahead_index(ractl); + pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; + pgoff_t mark = index + ra->size - ra->async_size; + int err = 0; + gfp_t gfp = readahead_gfp_mask(mapping); + + if (!mapping_large_folio_support(mapping) || ra->size < 4) + goto fallback; + + limit = min(limit, index + ra->size - 1); + + if (new_order < MAX_PAGECACHE_ORDER) { + new_order += 2; + if (new_order > MAX_PAGECACHE_ORDER) + new_order = MAX_PAGECACHE_ORDER; + while ((1 << new_order) > ra->size) + new_order--; + } + + while (index <= limit) { + unsigned int order = new_order; + + /* Align with smaller pages if needed */ + if (index & ((1UL << order) - 1)) { + order = __ffs(index); + if (order == 1) + order = 0; + } + /* Don't allocate pages past EOF */ + while (index + (1UL << order) - 1 > limit) { + if (--order == 1) + order = 0; + } + err = ra_alloc_folio(ractl, index, mark, order, gfp); + if (err) + break; + index += 1UL << order; + } + + if (index > limit) { + ra->size += index - limit - 1; + ra->async_size += index - limit - 1; + } + + read_pages(ractl, 
NULL, false); + + /* + * If there were already pages in the page cache, then we may have + * left some gaps. Let the regular readahead code take care of this + * situation. + */ + if (!err) + return; +fallback: + do_page_cache_ra(ractl, ra->size, ra->async_size); +} + +/* * A minimal readahead algorithm for trivial sequential/random reads. */ static void ondemand_readahead(struct readahead_control *ractl, - bool hit_readahead_marker, unsigned long req_size) + struct folio *folio, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); struct file_ra_state *ra = ractl->ra; @@ -470,12 +676,12 @@ static void ondemand_readahead(struct readahead_control *ractl, } /* - * Hit a marked page without valid readahead state. + * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ - if (hit_readahead_marker) { + if (folio) { pgoff_t start; rcu_read_lock(); @@ -548,7 +754,7 @@ readit: } ractl->_index = ra->start; - do_page_cache_ra(ractl, ra->size, ra->async_size); + page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0); } void page_cache_sync_ra(struct readahead_control *ractl, @@ -576,7 +782,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, } /* do read-ahead */ - ondemand_readahead(ractl, false, req_count); + ondemand_readahead(ractl, NULL, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); @@ -595,17 +801,11 @@ void page_cache_async_ra(struct readahead_control *ractl, folio_clear_readahead(folio); - /* - * Defer asynchronous read-ahead on IO congestion. - */ - if (inode_read_congested(ractl->mapping->host)) - return; - if (blk_cgroup_congested()) return; /* do read-ahead */ - ondemand_readahead(ractl, true, req_count); + ondemand_readahead(ractl, folio, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_ra); |