inode.c 139 KB
Newer Older
2001
2002
					noalloc_get_block_write)) {
		redirty_page:
2003
2004
2005
2006
			redirty_page_for_writepage(wbc, page);
			unlock_page(page);
			return 0;
		}
2007
2008
2009
2010
2011
		commit_write = 1;
	}
	page_bufs = page_buffers(page);
	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
			      ext4_bh_delay_or_unwritten)) {
2012
		/*
2013
2014
2015
		 * We don't want to do block allocation, so redirty
		 * the page and return.  We may reach here when we do
		 * a journal commit via journal_submit_inode_data_buffers.
2016
2017
2018
		 * We can also reach here via shrink_page_list but it
		 * should never be for direct reclaim so warn if that
		 * happens
2019
		 */
2020
2021
		WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
								PF_MEMALLOC);
2022
2023
2024
		goto redirty_page;
	}
	if (commit_write)
2025
		/* now mark the buffer_heads as dirty and uptodate */
2026
		block_commit_write(page, 0, len);
2027

2028
	if (PageChecked(page) && ext4_should_journal_data(inode))
2029
2030
2031
2032
		/*
		 * It's mmapped pagecache.  Add buffers and journal it.  There
		 * doesn't seem much point in redirtying the page here.
		 */
2033
		return __ext4_journalled_writepage(page, len);
2034

2035
	if (buffer_uninit(page_bufs)) {
2036
2037
2038
2039
		ext4_set_bh_endio(page_bufs, inode);
		ret = block_write_full_page_endio(page, noalloc_get_block_write,
					    wbc, ext4_end_io_buffer_write);
	} else
2040
2041
		ret = block_write_full_page(page, noalloc_get_block_write,
					    wbc);
2042
2043
2044
2045

	return ret;
}

2046
/*
 * This is called via ext4_da_writepages() to
 * calculate the total number of credits to reserve to fit
 * a single extent allocation into a single transaction;
 * ext4_da_writepages() will loop calling this before
 * the block allocation.
 */
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063

static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
	/* Start from everything delalloc has reserved for this inode. */
	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;

	/*
	 * With non-extent format the journal credit needed to
	 * insert nrblocks contiguous blocks is dependent on the
	 * number of contiguous blocks.  So we will limit the
	 * number of contiguous blocks to a sane value.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
	    (max_blocks > EXT4_MAX_TRANS_DATA))
		max_blocks = EXT4_MAX_TRANS_DATA;

	/* Convert the (clamped) block count into journal credits. */
	return ext4_chunk_trans_blocks(inode, max_blocks);
}
2070

2071
2072
/*
 * write_cache_pages_da - walk the list of dirty pages of the given
 * address space and accumulate pages that need writing, and call
 * mpage_da_map_and_submit to map a single contiguous memory region
 * and then write them.
 */
static int write_cache_pages_da(struct address_space *mapping,
				struct writeback_control *wbc,
				struct mpage_da_data *mpd,
				pgoff_t *done_index)
{
	struct buffer_head	*bh, *head;
	struct inode		*inode = mapping->host;
	struct pagevec		pvec;
	unsigned int		nr_pages;
	sector_t		logical;
	pgoff_t			index, end;
	long			nr_to_write = wbc->nr_to_write;
	int			i, tag, ret = 0;

	/* Reset the extent-accumulation state for this pass. */
	memset(mpd, 0, sizeof(struct mpage_da_data));
	mpd->wbc = wbc;
	mpd->inode = inode;
	pagevec_init(&pvec, 0);
	index = wbc->range_start >> PAGE_CACHE_SHIFT;
	end = wbc->range_end >> PAGE_CACHE_SHIFT;

	/*
	 * For integrity sync (or tagged writepages) only look at pages
	 * that were tagged TOWRITE before we started; otherwise chase
	 * all dirty pages.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;

	*done_index = index;
	while (index <= end) {
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			return 0;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end)
				goto out;

			*done_index = page->index + 1;

			/*
			 * If we can't merge this page, and we have
			 * accumulated a contiguous region, write it
			 */
			if ((mpd->next_page != page->index) &&
			    (mpd->next_page != mpd->first_page)) {
				mpage_da_map_and_submit(mpd);
				goto ret_extent_tail;
			}

			lock_page(page);

			/*
			 * If the page is no longer dirty, or its
			 * mapping no longer corresponds to inode we
			 * are writing (which means it has been
			 * truncated or invalidated), or the page is
			 * already under writeback and we are not
			 * doing a data integrity writeback, skip the page
			 */
			if (!PageDirty(page) ||
			    (PageWriteback(page) &&
			     (wbc->sync_mode == WB_SYNC_NONE)) ||
			    unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			wait_on_page_writeback(page);
			BUG_ON(PageWriteback(page));

			/* Start a new extent if this page isn't contiguous. */
			if (mpd->next_page != page->index)
				mpd->first_page = page->index;
			mpd->next_page = page->index + 1;
			logical = (sector_t) page->index <<
				(PAGE_CACHE_SHIFT - inode->i_blkbits);

			if (!page_has_buffers(page)) {
				/* Whole page is one delalloc region. */
				mpage_add_bh_to_extent(mpd, logical,
						       PAGE_CACHE_SIZE,
						       (1 << BH_Dirty) | (1 << BH_Uptodate));
				if (mpd->io_done)
					goto ret_extent_tail;
			} else {
				/*
				 * Page with regular buffer heads,
				 * just add all dirty ones
				 */
				head = page_buffers(page);
				bh = head;
				do {
					BUG_ON(buffer_locked(bh));
					/*
					 * We need to try to allocate
					 * unmapped blocks in the same page.
					 * Otherwise we won't make progress
					 * with the page in ext4_writepage
					 */
					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
						mpage_add_bh_to_extent(mpd, logical,
								       bh->b_size,
								       bh->b_state);
						if (mpd->io_done)
							goto ret_extent_tail;
					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
						/*
						 * mapped dirty buffer. We need
						 * to update the b_state
						 * because we look at b_state
						 * in mpage_da_map_blocks.  We
						 * don't update b_size because
						 * if we find an unmapped
						 * buffer_head later we need to
						 * use the b_state flag of that
						 * buffer_head.
						 */
						if (mpd->b_size == 0)
							mpd->b_state = bh->b_state & BH_FLAGS;
					}
					logical++;
				} while ((bh = bh->b_this_page) != head);
			}

			if (nr_to_write > 0) {
				nr_to_write--;
				if (nr_to_write == 0 &&
				    wbc->sync_mode == WB_SYNC_NONE)
					/*
					 * We stop writing back only if we are
					 * not doing integrity sync. In case of
					 * integrity sync we have to keep going
					 * because someone may be concurrently
					 * dirtying pages, and we might have
					 * synced a lot of newly appeared dirty
					 * pages, but have not synced all of the
					 * old dirty pages.
					 */
					goto out;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	return 0;
ret_extent_tail:
	/* Tell the caller we mapped/submitted one extent; more may remain. */
	ret = MPAGE_DA_EXTENT_TAIL;
out:
	pagevec_release(&pvec);
	cond_resched();
	return ret;
}


2238
static int ext4_da_writepages(struct address_space *mapping,
2239
			      struct writeback_control *wbc)
2240
{
2241
2242
	pgoff_t	index;
	int range_whole = 0;
2243
	handle_t *handle = NULL;
2244
	struct mpage_da_data mpd;
2245
	struct inode *inode = mapping->host;
2246
	int pages_written = 0;
2247
	unsigned int max_pages;
2248
	int range_cyclic, cycled = 1, io_done = 0;
2249
2250
	int needed_blocks, ret = 0;
	long desired_nr_to_write, nr_to_writebump = 0;
2251
	loff_t range_start = wbc->range_start;
2252
	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2253
	pgoff_t done_index = 0;
2254
	pgoff_t end;
2255
	struct blk_plug plug;
2256

2257
	trace_ext4_da_writepages(inode, wbc);
2258

2259
2260
2261
2262
2263
	/*
	 * No pages to write? This is mainly a kludge to avoid starting
	 * a transaction for special inodes like journal inode on last iput()
	 * because that could violate lock ordering on umount
	 */
2264
	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2265
		return 0;
2266
2267
2268
2269
2270

	/*
	 * If the filesystem has aborted, it is read-only, so return
	 * right away instead of dumping stack traces later on that
	 * will obscure the real source of the problem.  We test
2271
	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2272
2273
2274
2275
2276
	 * the latter could be true if the filesystem is mounted
	 * read-only, and in that case, ext4_da_writepages should
	 * *never* be called, so if that ever happens, we would want
	 * the stack trace.
	 */
2277
	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2278
2279
		return -EROFS;

2280
2281
	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
		range_whole = 1;
2282

2283
2284
	range_cyclic = wbc->range_cyclic;
	if (wbc->range_cyclic) {
2285
		index = mapping->writeback_index;
2286
2287
2288
2289
2290
		if (index)
			cycled = 0;
		wbc->range_start = index << PAGE_CACHE_SHIFT;
		wbc->range_end  = LLONG_MAX;
		wbc->range_cyclic = 0;
2291
2292
		end = -1;
	} else {
2293
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2294
2295
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
	}
2296

2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
	/*
	 * This works around two forms of stupidity.  The first is in
	 * the writeback code, which caps the maximum number of pages
	 * written to be 1024 pages.  This is wrong on multiple
	 * levels; different architectues have a different page size,
	 * which changes the maximum amount of data which gets
	 * written.  Secondly, 4 megabytes is way too small.  XFS
	 * forces this value to be 16 megabytes by multiplying
	 * nr_to_write parameter by four, and then relies on its
	 * allocator to allocate larger extents to make them
	 * contiguous.  Unfortunately this brings us to the second
	 * stupidity, which is that ext4's mballoc code only allocates
	 * at most 2048 blocks.  So we force contiguous writes up to
	 * the number of dirty blocks in the inode, or
	 * sbi->max_writeback_mb_bump whichever is smaller.
	 */
	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2314
2315
2316
2317
2318
2319
	if (!range_cyclic && range_whole) {
		if (wbc->nr_to_write == LONG_MAX)
			desired_nr_to_write = wbc->nr_to_write;
		else
			desired_nr_to_write = wbc->nr_to_write * 8;
	} else
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
							   max_pages);
	if (desired_nr_to_write > max_pages)
		desired_nr_to_write = max_pages;

	if (wbc->nr_to_write < desired_nr_to_write) {
		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
		wbc->nr_to_write = desired_nr_to_write;
	}

2330
retry:
2331
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2332
2333
		tag_pages_for_writeback(mapping, index, end);

2334
	blk_start_plug(&plug);
2335
	while (!ret && wbc->nr_to_write > 0) {
2336
2337
2338
2339
2340
2341
2342
2343

		/*
		 * we  insert one extent at a time. So we need
		 * credit needed for single extent allocation.
		 * journalled mode is currently not supported
		 * by delalloc
		 */
		BUG_ON(ext4_should_journal_data(inode));
2344
		needed_blocks = ext4_da_writepages_trans_blocks(inode);
2345

2346
2347
2348
2349
		/* start a new transaction*/
		handle = ext4_journal_start(inode, needed_blocks);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
2350
			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2351
			       "%ld pages, ino %lu; err %d", __func__,
2352
				wbc->nr_to_write, inode->i_ino, ret);
2353
			blk_finish_plug(&plug);
2354
2355
			goto out_writepages;
		}
2356
2357

		/*
2358
		 * Now call write_cache_pages_da() to find the next
2359
		 * contiguous region of logical blocks that need
2360
		 * blocks to be allocated by ext4 and submit them.
2361
		 */
2362
		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
2363
		/*
2364
		 * If we have a contiguous extent of pages and we
2365
2366
2367
2368
		 * haven't done the I/O yet, map the blocks and submit
		 * them for I/O.
		 */
		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2369
			mpage_da_map_and_submit(&mpd);
2370
2371
			ret = MPAGE_DA_EXTENT_TAIL;
		}
2372
		trace_ext4_da_write_pages(inode, &mpd);
2373
		wbc->nr_to_write -= mpd.pages_written;
2374

2375
		ext4_journal_stop(handle);
2376

2377
		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
2378
2379
2380
2381
			/* commit the transaction which would
			 * free blocks released in the transaction
			 * and try again
			 */
2382
			jbd2_journal_force_commit_nested(sbi->s_journal);
2383
2384
			ret = 0;
		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
2385
			/*
2386
2387
2388
			 * Got one extent now try with rest of the pages.
			 * If mpd.retval is set -EIO, journal is aborted.
			 * So we don't need to write any more.
2389
			 */
2390
			pages_written += mpd.pages_written;
2391
			ret = mpd.retval;
2392
			io_done = 1;
2393
		} else if (wbc->nr_to_write)
2394
2395
2396
2397
2398
2399
			/*
			 * There is no more writeout needed
			 * or we requested for a noblocking writeout
			 * and we found the device congested
			 */
			break;
2400
	}
2401
	blk_finish_plug(&plug);
2402
2403
2404
2405
2406
2407
2408
	if (!io_done && !cycled) {
		cycled = 1;
		index = 0;
		wbc->range_start = index << PAGE_CACHE_SHIFT;
		wbc->range_end  = mapping->writeback_index - 1;
		goto retry;
	}
2409
2410

	/* Update index */
2411
	wbc->range_cyclic = range_cyclic;
2412
2413
2414
2415
2416
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		/*
		 * set the writeback_index so that range_cyclic
		 * mode will write it back later
		 */
2417
		mapping->writeback_index = done_index;
2418

2419
out_writepages:
2420
	wbc->nr_to_write -= nr_to_writebump;
2421
	wbc->range_start = range_start;
2422
	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2423
	return ret;
2424
2425
}

2426
2427
2428
2429
2430
2431
2432
2433
2434
#define FALL_BACK_TO_NONDELALLOC 1
static int ext4_nonda_switch(struct super_block *sb)
{
	s64 free_blocks, dirty_blocks;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	/*
	 * switch to non delalloc mode if we are running low
	 * on free block. The free block accounting via percpu
2435
	 * counters can get slightly wrong with percpu_counter_batch getting
2436
2437
2438
2439
	 * accumulated on each CPU without updating global counters
	 * Delalloc need an accurate free block accounting. So switch
	 * to non delalloc when we are near to error range.
	 */
2440
2441
2442
	free_blocks  = EXT4_C2B(sbi,
		percpu_counter_read_positive(&sbi->s_freeclusters_counter));
	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
	/*
	 * Start pushing delalloc when 1/2 of free blocks are dirty.
	 */
	if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
	    !writeback_in_progress(sb->s_bdi) &&
	    down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	}

2453
	if (2 * free_blocks < 3 * dirty_blocks ||
2454
		free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
2455
		/*
2456
2457
		 * free block count is less than 150% of dirty blocks
		 * or free blocks is less than watermark
2458
2459
2460
2461
2462
2463
		 */
		return 1;
	}
	return 0;
}

2464
/*
 * ->write_begin for delayed allocation.  Falls back to the regular
 * ext4_write_begin() path when ext4_nonda_switch() says free space is
 * too low, recording the decision in *fsdata for ext4_da_write_end().
 * Otherwise starts a 1-credit transaction (for a possible i_disksize
 * update), grabs and prepares the page, and retries on ENOSPC.
 */
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	int ret, retries = 0;
	struct page *page;
	pgoff_t index;
	struct inode *inode = mapping->host;
	handle_t *handle;

	index = pos >> PAGE_CACHE_SHIFT;

	if (ext4_nonda_switch(inode->i_sb)) {
		/* Tell write_end we took the non-delalloc path. */
		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
		return ext4_write_begin(file, mapping, pos,
					len, flags, pagep, fsdata);
	}
	*fsdata = (void *)0;
	trace_ext4_da_write_begin(inode, pos, len, flags);
retry:
	/*
	 * With delayed allocation, we don't log the i_disksize update
	 * if there is delayed block allocation. But we still need
	 * to journal the i_disksize update if the write is to the end
	 * of a file which has an already mapped buffer.
	 */
	handle = ext4_journal_start(inode, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}
	/* We cannot recurse into the filesystem as the transaction is already
	 * started */
	flags |= AOP_FLAG_NOFS;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page) {
		ext4_journal_stop(handle);
		ret = -ENOMEM;
		goto out;
	}
	*pagep = page;

	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
	if (ret < 0) {
		unlock_page(page);
		ext4_journal_stop(handle);
		page_cache_release(page);
		/*
		 * block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 */
		if (pos + len > inode->i_size)
			ext4_truncate_failed_write(inode);
	}

	/* Transient ENOSPC may clear after a journal commit; retry. */
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}

2527
2528
2529
2530
2531
/*
 * Check if we should update i_disksize
 * when write to the end of file but not require block allocation
 */
static int ext4_da_should_update_i_disksize(struct page *page,
2532
					    unsigned long offset)
2533
2534
2535
2536
2537
2538
2539
2540
2541
{
	struct buffer_head *bh;
	struct inode *inode = page->mapping->host;
	unsigned int idx;
	int i;

	bh = page_buffers(page);
	idx = offset >> inode->i_blkbits;

2542
	for (i = 0; i < idx; i++)
2543
2544
		bh = bh->b_this_page;

2545
	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2546
2547
2548
2549
		return 0;
	return 1;
}

2550
/*
 * ->write_end for delayed allocation.  If write_begin fell back to the
 * non-delalloc path (recorded in fsdata), dispatch to the matching
 * ordered/writeback write_end.  Otherwise push i_disksize forward when
 * the copied data extends past it and no block allocation is needed,
 * then finish via generic_write_end() and stop the transaction.
 */
static int ext4_da_write_end(struct file *file,
			     struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	handle_t *handle = ext4_journal_current_handle();
	loff_t new_i_size;
	unsigned long start, end;
	int write_mode = (int)(unsigned long)fsdata;

	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
		switch (ext4_inode_journal_mode(inode)) {
		case EXT4_INODE_ORDERED_DATA_MODE:
			return ext4_ordered_write_end(file, mapping, pos,
					len, copied, page, fsdata);
		case EXT4_INODE_WRITEBACK_DATA_MODE:
			return ext4_writeback_write_end(file, mapping, pos,
					len, copied, page, fsdata);
		default:
			/* Journalled data mode never takes this path. */
			BUG();
		}
	}

	trace_ext4_da_write_end(inode, pos, len, copied);
	start = pos & (PAGE_CACHE_SIZE - 1);
	end = start + copied - 1;

	/*
	 * generic_write_end() will run mark_inode_dirty() if i_size
	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
	 * into that.
	 */

	new_i_size = pos + copied;
	if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
		if (ext4_da_should_update_i_disksize(page, end)) {
			down_write(&EXT4_I(inode)->i_data_sem);
			/* Re-check under i_data_sem: another writer may
			 * have advanced i_disksize meanwhile. */
			if (new_i_size > EXT4_I(inode)->i_disksize)
				EXT4_I(inode)->i_disksize = new_i_size;
			up_write(&EXT4_I(inode)->i_data_sem);
			/* We need to mark inode dirty even if
			 * new_i_size is less than inode->i_size
			 * but greater than i_disksize. (hint: delalloc)
			 */
			ext4_mark_inode_dirty(handle, inode);
		}
	}
	ret2 = generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);
	copied = ret2;
	if (ret2 < 0)
		ret = ret2;
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	/* Bytes copied on success, first error otherwise. */
	return ret ? ret : copied;
}

static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
	/*
	 * Drop any delalloc block reservations held for the invalidated
	 * range, then do the normal invalidation.
	 */
	BUG_ON(!PageLocked(page));
	if (page_has_buffers(page))
		ext4_da_page_release_reservation(page, offset);

	ext4_invalidatepage(page, offset);
}

2628
2629
2630
2631
2632
/*
 * Force all delayed allocation blocks to be allocated for a given inode.
 */
int ext4_alloc_da_blocks(struct inode *inode)
{
	trace_ext4_alloc_da_blocks(inode);

	/* Nothing reserved means nothing delayed: nothing to do. */
	if (!EXT4_I(inode)->i_reserved_data_blocks &&
	    !EXT4_I(inode)->i_reserved_meta_blocks)
		return 0;

	/*
	 * We do something simple for now.  The filemap_flush() will
	 * also start triggering a write of the data blocks, which is
	 * not strictly speaking necessary (and for users of
	 * laptop_mode, not even desirable).  However, to do otherwise
	 * would require replicating code paths in:
	 *
	 * ext4_da_writepages() ->
	 *    write_cache_pages() ---> (via passed in callback function)
	 *        __mpage_da_writepage() -->
	 *           mpage_add_bh_to_extent()
	 *           mpage_da_map_blocks()
	 *
	 * The problem is that write_cache_pages(), located in
	 * mm/page-writeback.c, marks pages clean in preparation for
	 * doing I/O, which is not desirable if we're not planning on
	 * doing I/O at all.
	 *
	 * We could call write_cache_pages(), and then redirty all of
	 * the pages by calling redirty_page_for_writepage() but that
	 * would be ugly in the extreme.  So instead we would need to
	 * replicate parts of the code in the above functions,
	 * simplifying them because we wouldn't actually intend to
	 * write out the pages, but rather only collect contiguous
	 * logical block extents, call the multi-block allocator, and
	 * then update the buffer heads with the block allocations.
	 *
	 * For now, though, we'll cheat by calling filemap_flush(),
	 * which will map the blocks, and start the I/O, but not
	 * actually wait for the I/O to complete.
	 */
	return filemap_flush(inode->i_mapping);
}
2672

2673
2674
2675
2676
2677
/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext4 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zero's written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	/*
	 * We can get here for an inline file via the FIBMAP ioctl;
	 * inline data has no on-disk block to report.
	 */
	if (ext4_has_inline_data(inode))
		return 0;

	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
			test_opt(inode->i_sb, DELALLOC)) {
		/*
		 * With delalloc we want to sync the file
		 * so that we can make sure we allocate
		 * blocks for file
		 */
		filemap_write_and_wait(mapping);
	}

	if (EXT4_JOURNAL(inode) &&
	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
		/*
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen.
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DOS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.)
		 *
		 * NB. EXT4_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
		journal = EXT4_JOURNAL(inode);
		jbd2_journal_lock_updates(journal);
		err = jbd2_journal_flush(journal);
		jbd2_journal_unlock_updates(journal);

		if (err)
			return 0;
	}

	return generic_block_bmap(mapping, block, ext4_get_block);
}

2742
static int ext4_readpage(struct file *file, struct page *page)
2743
{
2744
2745
2746
	int ret = -EAGAIN;
	struct inode *inode = page->mapping->host;

2747
	trace_ext4_readpage(page);
2748
2749
2750
2751
2752
2753
2754
2755

	if (ext4_has_inline_data(inode))
		ret = ext4_readpage_inline(inode, page);

	if (ret == -EAGAIN)
		return mpage_readpage(page, ext4_get_block);

	return ret;
2756
2757
2758
}

static int
2759
ext4_readpages(struct file *file, struct address_space *mapping,
2760
2761
		struct list_head *pages, unsigned nr_pages)
{
2762
2763
2764
2765
2766
2767
	struct inode *inode = mapping->host;

	/* If the file has inline data, no need to do readpages. */
	if (ext4_has_inline_data(inode))
		return 0;

2768
	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2769
2770
}

2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
/*
 * Free any io_end structures attached (via bh->b_private) to uninit
 * buffers in the part of @page being invalidated (at or past @offset).
 * Clears the uninit flag and detaches the end_io callback as it goes.
 */
static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh;
	unsigned int curr_off = 0;

	if (!page_has_buffers(page))
		return;
	head = bh = page_buffers(page);
	do {
		/*
		 * NOTE: test_clear_buffer_uninit() has a side effect
		 * (clears the flag) and is deliberately short-circuited
		 * behind the offset check.
		 */
		if (offset <= curr_off && test_clear_buffer_uninit(bh)
					&& bh->b_private) {
			ext4_free_io_end(bh->b_private);
			bh->b_private = NULL;
			bh->b_end_io = NULL;
		}
		curr_off = curr_off + bh->b_size;
		bh = bh->b_this_page;
	} while (bh != head);
}

2791
static void ext4_invalidatepage(struct page *page, unsigned long offset)
2792
{
2793
	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2794

2795
2796
	trace_ext4_invalidatepage(page, offset);

2797
2798
2799
2800
2801
	/*
	 * free any io_end structure allocated for buffers to be discarded
	 */
	if (ext4_should_dioread_nolock(page->mapping->host))
		ext4_invalidatepage_free_endio(page, offset);
2802
2803
2804
2805
2806
2807
	/*
	 * If it's a full truncate we just forget about the pending dirtying
	 */
	if (offset == 0)
		ClearPageChecked(page);

2808
2809
2810
2811
	if (journal)
		jbd2_journal_invalidatepage(journal, page, offset);
	else
		block_invalidatepage(page, offset);
2812
2813
}

2814
static int ext4_releasepage(struct page *page, gfp_t wait)
{
	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

	trace_ext4_releasepage(page);

	WARN_ON(PageChecked(page));
	if (!page_has_buffers(page))
		return 0;

	/* Let jbd2 decide when the page belongs to a journalled inode. */
	if (!journal)
		return try_to_free_buffers(page);
	return jbd2_journal_try_to_free_buffers(journal, page, wait);
}

2829
2830
2831
2832
2833
/*
 * ext4_get_block used when preparing for a DIO write or buffer write.
 * We allocate an uninitialized extent if blocks haven't been allocated.
 * The extent will be converted to initialized after the IO is complete.
 */
static int ext4_get_block_write(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create)
{
	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
		   inode->i_ino, create);
	/* Delegate to the common helper with the IO-create-extent flag. */
	return _ext4_get_block(inode, iblock, bh_result,
			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
}

2843
/*
 * Same as ext4_get_block_write() but maps with EXT4_GET_BLOCKS_NO_LOCK,
 * i.e. without taking i_data_sem (used on the overwrite DIO path).
 */
static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create)
{
	ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
		   inode->i_ino, create);
	return _ext4_get_block(inode, iblock, bh_result,
			       EXT4_GET_BLOCKS_NO_LOCK);
}

2852
/*
 * Completion callback for async direct IO.  For writes into unwritten
 * extents, queues the io_end for extent conversion; otherwise frees the
 * io_end and completes the iocb directly.
 *
 * Note: the "out" label sits inside the non-unwritten branch so that
 * both the "no io_end / zero size" and "no unwritten extents" paths
 * share the aio_complete()/inode_dio_done() tail.
 */
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
			    ssize_t size, void *private, int ret,
			    bool is_async)
{
	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
	ext4_io_end_t *io_end = iocb->private;

	/* if not async direct IO or dio with 0 bytes write, just return */
	if (!io_end || !size)
		goto out;

	ext_debug("ext4_end_io_dio(): io_end 0x%p "
		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
		  iocb->private, io_end->inode->i_ino, iocb, offset,
		  size);

	iocb->private = NULL;

	/* if not aio dio with unwritten extents, just free io and return */
	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
		ext4_free_io_end(io_end);
out:
		if (is_async)
			aio_complete(iocb, ret, 0);
		inode_dio_done(inode);
		return;
	}

	/* Record the written range for the later extent conversion. */
	io_end->offset = offset;
	io_end->size = size;
	if (is_async) {
		/* aio_complete() is deferred until conversion finishes. */
		io_end->iocb = iocb;
		io_end->result = ret;
	}

	ext4_add_complete_io(io_end);
}
2889

2890
2891
2892
2893
2894
2895
2896
2897
2898
/*
 * end_io handler for buffered writes into unwritten extents (set up by
 * ext4_set_bh_endio()).  Queues the io_end for extent conversion unless
 * the superblock is being unmounted, then always detaches the io_end
 * from the buffer_head and finishes the async write.
 */
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
	ext4_io_end_t *io_end = bh->b_private;
	struct inode *inode;

	/* Nothing to convert: not an uninit buffer, or no io_end attached. */
	if (!test_clear_buffer_uninit(bh) || !io_end)
		goto out;

	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
		ext4_msg(io_end->inode->i_sb, KERN_INFO,
			 "sb umounted, discard end_io request for inode %lu",
			 io_end->inode->i_ino);
		ext4_free_io_end(io_end);
		goto out;
	}

	/*
	 * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
	 * but being more careful is always safe for the future change.
	 */
	inode = io_end->inode;
	ext4_set_io_unwritten_flag(inode, io_end);
	ext4_add_complete_io(io_end);
out:
	bh->b_private = NULL;
	bh->b_end_io = NULL;
	clear_buffer_uninit(bh);
	end_buffer_async_write(bh, uptodate);
}

/*
 * Attach an io_end to @bh so that ext4_end_io_buffer_write() runs at IO
 * completion and can convert the underlying unwritten extent.
 * Always returns 0: allocation failure is retried forever (GFP_ATOMIC
 * with a schedule() between attempts — a busy-wait by design here).
 */
static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
{
	ext4_io_end_t *io_end;
	struct page *page = bh->b_page;
	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
	size_t size = bh->b_size;

retry:
	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
	if (!io_end) {
		pr_warn_ratelimited("%s: allocation fail\n", __func__);
		schedule();
		goto retry;
	}
	io_end->offset = offset;
	io_end->size = size;
	/*
	 * We need to hold a reference to the page to make sure it
	 * doesn't get evicted before ext4_end_io_work() has a chance
	 * to convert the extent from written to unwritten.
	 */
	io_end->page = page;
	get_page(io_end->page);

	bh->b_private = io_end;
	bh->b_end_io = ext4_end_io_buffer_write;
	return 0;
}

2949
2950
2951
2952
2953
/*
 * For ext4 extent files, ext4 will do direct-io write to holes,
 * preallocated extents, and those write extend the file, no need to
 * fall back to buffered IO.
 *
2954
 * For holes, we fallocate those blocks, mark them as uninitialized
2955
 * If those blocks were preallocated, we mark sure they are split, but
2956
 * still keep the range to write as uninitialized.
2957
 *
2958
 * The unwritten extents will be converted to written when DIO is completed.
2959
 * For async direct IO, since the IO may still pending when return, we
Lucas De Marchi's avatar
Lucas De Marchi committed
2960
 * set up an end_io call back function, which will do the conversion
2961
 * when async direct IO completed.
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
 *
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list.  So recovery will truncate it back to the original size
 * if the machine crashes during the write.
 *
 */
static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
			      const struct iovec *iov, loff_t offset,
			      unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;
	size_t count = iov_length(iov, nr_segs);
2976
2977
2978
	int overwrite = 0;
	get_block_t *get_block_func = NULL;
	int dio_flags = 0;
2979
	loff_t final_size = offset + count;
2980

2981
2982
2983
	/* Use the old path for reads and writes beyond i_size. */
	if (rw != WRITE || final_size > inode->i_size)
		return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2984

2985
	BUG_ON(iocb->private == NULL);
2986

2987
2988
	/* If we do a overwrite dio, i_mutex locking can be released */
	overwrite = *((int *)iocb->private);
2989

2990
2991
2992
2993
2994
	if (overwrite) {
		atomic_inc(&inode->i_dio_count);
		down_read(&EXT4_I(inode)->i_data_sem);
		mutex_unlock(&inode->i_mutex);
	}
2995

2996
2997
2998
2999
3000
	/*
	 * We could direct write to holes and fallocate.
	 *
	 * Allocated blocks to fill the hole are marked as
	 * uninitialized to prevent parallel buffered read to expose
For faster browsing, not all history is shown. View entire blame