 * ext4 never places buffers on inode->i_mapping->private_list.  metadata
 * buffers are managed internally.
 */
static int ext4_ordered_write_end(struct file *file,
				  struct address_space *mapping,
				  loff_t pos, unsigned len, unsigned copied,
				  struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;

	trace_ext4_ordered_write_end(inode, pos, len, copied);
	ret = ext4_jbd2_file_inode(handle, inode);

	if (ret == 0) {
		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);
		copied = ret2;
		if (pos + len > inode->i_size && ext4_can_truncate(inode))
			/* If we have allocated more blocks and copied
			 * less, we will have blocks allocated outside
			 * inode->i_size, so truncate them.
			 */
			ext4_orphan_add(handle, inode);
		if (ret2 < 0)
			ret = ret2;
	} else {
		unlock_page(page);
		page_cache_release(page);
	}

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return ret ? ret : copied;
}

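/*
 * Same as ext4_ordered_write_end(), minus the initial
 * ext4_jbd2_file_inode() step.
 */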
static int ext4_writeback_write_end(struct file *file,
				    struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned copied,
				    struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;

	trace_ext4_writeback_write_end(inode, pos, len, copied);
	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);
	copied = ret2;
	if (pos + len > inode->i_size && ext4_can_truncate(inode))
		/* If we have allocated more blocks and copied
		 * less, we will have blocks allocated outside
		 * inode->i_size, so truncate them.
		 */
		ext4_orphan_add(handle, inode);

	if (ret2 < 0)
		ret = ret2;

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return ret ? ret : copied;
}

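/*
 * write_end handler used when inode data is journalled: each buffer in
 * the written range is handed to the running transaction via
 * write_end_fn() rather than simply being marked dirty.
 */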
static int ext4_journalled_write_end(struct file *file,
				     struct address_space *mapping,
				     loff_t pos, unsigned len, unsigned copied,
				     struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	unsigned from, to;
	loff_t new_i_size;

	trace_ext4_journalled_write_end(inode, pos, len, copied);
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

	BUG_ON(!ext4_handle_valid(handle));

	if (copied < len) {
		if (!PageUptodate(page))
			copied = 0;
		page_zero_new_buffers(page, from+copied, to);
	}

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, write_end_fn);
	if (!partial)
		SetPageUptodate(page);
	new_i_size = pos + copied;
	if (new_i_size > inode->i_size)
		i_size_write(inode, pos+copied);
	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
	if (new_i_size > EXT4_I(inode)->i_disksize) {
		ext4_update_i_disksize(inode, new_i_size);
		ret2 = ext4_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}

	unlock_page(page);
	page_cache_release(page);
	if (pos + len > inode->i_size && ext4_can_truncate(inode))
		/* If we have allocated more blocks and copied
		 * less, we will have blocks allocated outside
		 * inode->i_size, so truncate them.
		 */
		ext4_orphan_add(handle, inode);

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;
	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return ret ? ret : copied;
}

/*
 * Reserve a single cluster located at lblock
 */
static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
{
	int retries = 0;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int md_needed;
	int ret;
	ext4_lblk_t save_last_lblock;
	int save_len;

	/*
	 * We will charge metadata quota at writeout time; this saves
	 * us from metadata over-estimation, though we may go over by
	 * a small amount in the end.  Here we just reserve for data.
	 */
	ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
	if (ret)
		return ret;

	/*
	 * Recalculate the amount of metadata blocks to reserve
	 * in order to allocate nrblocks.
	 * The worst case is one extent per block.
	 */
repeat:
	spin_lock(&ei->i_block_reservation_lock);
	/*
	 * ext4_calc_metadata_amount() has side effects, which we have
	 * to be prepared to undo if we fail to claim space.
	 */
	save_len = ei->i_da_metadata_calc_len;
	save_last_lblock = ei->i_da_metadata_calc_last_lblock;
	md_needed = EXT4_NUM_B2C(sbi,
				 ext4_calc_metadata_amount(inode, lblock));
	trace_ext4_da_reserve_space(inode, md_needed);

	/*
	 * We do still charge estimated metadata to the sb though;
	 * we cannot afford to run out of free blocks.
	 */
	if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
		ei->i_da_metadata_calc_len = save_len;
		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
		spin_unlock(&ei->i_block_reservation_lock);
		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
			yield();
			goto repeat;
		}
		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
		return -ENOSPC;
	}
	ei->i_reserved_data_blocks++;
	ei->i_reserved_meta_blocks += md_needed;
	spin_unlock(&ei->i_block_reservation_lock);

	return 0;       /* success */
}

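/*
 * Release @to_free clusters of this inode's delayed allocation
 * reservation: update the per-inode and superblock dirty-cluster
 * counters and give the reservation back to the quota subsystem.
 */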
static void ext4_da_release_space(struct inode *inode, int to_free)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!to_free)
		return;		/* Nothing to release, exit */

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);

	trace_ext4_da_release_space(inode, to_free);
	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
		/*
		 * if there aren't enough reserved blocks, then the
		 * counter is messed up somewhere.  Since this
		 * function is called from invalidate page, it's
		 * harmless to return without any action.
		 */
		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
			 "ino %lu, to_free %d with only %d reserved "
			 "data blocks", inode->i_ino, to_free,
			 ei->i_reserved_data_blocks);
		WARN_ON(1);
		to_free = ei->i_reserved_data_blocks;
	}
	ei->i_reserved_data_blocks -= to_free;

	if (ei->i_reserved_data_blocks == 0) {
		/*
		 * We can release all of the reserved metadata blocks
		 * only when we have written all of the delayed
		 * allocation blocks.
		 * Note that in case of bigalloc, i_reserved_meta_blocks,
		 * i_reserved_data_blocks, etc. refer to number of clusters.
		 */
		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
				   ei->i_reserved_meta_blocks);
		ei->i_reserved_meta_blocks = 0;
		ei->i_da_metadata_calc_len = 0;
	}

	/* update fs dirty data blocks counter */
	percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}

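/*
 * Called when (part of) a page is being invalidated: clear the delay
 * bit on every delayed buffer at or beyond @offset, drop the matching
 * extent status entries, and release the per-cluster reservations that
 * are no longer needed.
 */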
static void ext4_da_page_release_reservation(struct page *page,
					     unsigned long offset)
{
	int to_release = 0;
	struct buffer_head *head, *bh;
	unsigned int curr_off = 0;
	struct inode *inode = page->mapping->host;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int num_clusters;
	ext4_fsblk_t lblk;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;

		if ((offset <= curr_off) && (buffer_delay(bh))) {
			to_release++;
			clear_buffer_delay(bh);
		}
		curr_off = next_off;
	} while ((bh = bh->b_this_page) != head);

	if (to_release) {
		lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
		ext4_es_remove_extent(inode, lblk, to_release);
	}

	/* If we have released all the blocks belonging to a cluster, then we
	 * need to release the reserved space for that cluster. */
	num_clusters = EXT4_NUM_B2C(sbi, to_release);
	while (num_clusters > 0) {
		lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
			((num_clusters - 1) << sbi->s_cluster_bits);
		if (sbi->s_cluster_ratio == 1 ||
		    !ext4_find_delalloc_cluster(inode, lblk))
			ext4_da_release_space(inode, 1);

		num_clusters--;
	}
}

/*
 * Delayed allocation stuff
 */

/*
 * mpage_da_submit_io - walks through the extent of pages and tries to
 * write them out with the writepage() callback
 *
 * @mpd->inode: inode
 * @mpd->first_page: first page of the extent
 * @mpd->next_page: page after the last page of the extent
 *
 * By the time mpage_da_submit_io() is called we expect all blocks
 * to be allocated. This may be wrong if allocation failed.
 *
 * As the pages are already locked by write_cache_pages(), we can't use it.
 */
static int mpage_da_submit_io(struct mpage_da_data *mpd,
			      struct ext4_map_blocks *map)
{
	struct pagevec pvec;
	unsigned long index, end;
	int ret = 0, err, nr_pages, i;
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;
	loff_t size = i_size_read(inode);
	unsigned int len, block_start;
	struct buffer_head *bh, *page_bufs = NULL;
	int journal_data = ext4_should_journal_data(inode);
	sector_t pblock = 0, cur_logical = 0;
	struct ext4_io_submit io_submit;

	BUG_ON(mpd->next_page <= mpd->first_page);
	memset(&io_submit, 0, sizeof(io_submit));
	/*
	 * We need to start from the first_page to the next_page - 1
	 * to make sure we also write the mapped dirty buffer_heads.
	 * If we look at mpd->b_blocknr we would only be looking
	 * at the currently mapped buffer_heads.
	 */
	index = mpd->first_page;
	end = mpd->next_page - 1;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			int commit_write = 0, skip_page = 0;
			struct page *page = pvec.pages[i];

			index = page->index;
			if (index > end)
				break;

			if (index == size >> PAGE_CACHE_SHIFT)
				len = size & ~PAGE_CACHE_MASK;
			else
				len = PAGE_CACHE_SIZE;
			if (map) {
				cur_logical = index << (PAGE_CACHE_SHIFT -
							inode->i_blkbits);
				pblock = map->m_pblk + (cur_logical -
							map->m_lblk);
			}
			index++;

			BUG_ON(!PageLocked(page));
			BUG_ON(PageWriteback(page));

			/*
			 * If the page does not have buffers (for
			 * whatever reason), try to create them using
			 * __block_write_begin.  If this fails,
			 * skip the page and move on.
			 */
			if (!page_has_buffers(page)) {
				if (__block_write_begin(page, 0, len,
						noalloc_get_block_write)) {
				skip_page:
					unlock_page(page);
					continue;
				}
				commit_write = 1;
			}

			bh = page_bufs = page_buffers(page);
			block_start = 0;
			do {
				if (!bh)
					goto skip_page;
				if (map && (cur_logical >= map->m_lblk) &&
				    (cur_logical <= (map->m_lblk +
						     (map->m_len - 1)))) {
					if (buffer_delay(bh)) {
						clear_buffer_delay(bh);
						bh->b_blocknr = pblock;
					}
					if (buffer_unwritten(bh) ||
					    buffer_mapped(bh))
						BUG_ON(bh->b_blocknr != pblock);
					if (map->m_flags & EXT4_MAP_UNINIT)
						set_buffer_uninit(bh);
					clear_buffer_unwritten(bh);
				}

				/*
				 * skip page if block allocation undone and
				 * block is dirty
				 */
				if (ext4_bh_delay_or_unwritten(NULL, bh))
					skip_page = 1;
				bh = bh->b_this_page;
				block_start += bh->b_size;
				cur_logical++;
				pblock++;
			} while (bh != page_bufs);

			if (skip_page)
				goto skip_page;

			if (commit_write)
				/* mark the buffer_heads as dirty & uptodate */
				block_commit_write(page, 0, len);

			clear_page_dirty_for_io(page);
			/*
			 * Delalloc doesn't support data journalling,
			 * but eventually maybe we'll lift this
			 * restriction.
			 */
			if (unlikely(journal_data && PageChecked(page)))
				err = __ext4_journalled_writepage(page, len);
			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
				err = ext4_bio_write_page(&io_submit, page,
							  len, mpd->wbc);
			else if (buffer_uninit(page_bufs)) {
				ext4_set_bh_endio(page_bufs, inode);
				err = block_write_full_page_endio(page,
					noalloc_get_block_write,
					mpd->wbc, ext4_end_io_buffer_write);
			} else
				err = block_write_full_page(page,
					noalloc_get_block_write, mpd->wbc);

			if (!err)
				mpd->pages_written++;

			/*
			 * In error case, we have to continue because
			 * remaining pages are still locked
			 */
			if (ret == 0)
				ret = err;
		}
		pagevec_release(&pvec);
	}
	ext4_io_submit(&io_submit);
	return ret;
}

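/*
 * Invalidate and unlock all the pages in mpd's range after a block
 * allocation failure, dropping their extent status entries as well.
 */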
static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
{
	int nr_pages, i;
	pgoff_t index, end;
	struct pagevec pvec;
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;
	ext4_lblk_t start, last;

	index = mpd->first_page;
	end   = mpd->next_page - 1;

	start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	ext4_es_remove_extent(inode, start, last - start + 1);

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];
			if (page->index > end)
				break;
			BUG_ON(!PageLocked(page));
			BUG_ON(PageWriteback(page));
			block_invalidatepage(page, 0);
			ClearPageUptodate(page);
			unlock_page(page);
		}
		index = pvec.pages[nr_pages - 1]->index + 1;
		pagevec_release(&pvec);
	}
	return;
}

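/*
 * Dump the filesystem-wide free/dirty cluster counters and this inode's
 * reservation counters; used to diagnose unexpected ENOSPC during
 * delayed allocation writeback.
 */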
static void ext4_print_free_blocks(struct inode *inode)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct super_block *sb = inode->i_sb;

	ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
	       EXT4_C2B(EXT4_SB(inode->i_sb),
			ext4_count_free_clusters(inode->i_sb)));
	ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
	ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
		percpu_counter_sum(&sbi->s_freeclusters_counter)));
	ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
		percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
	ext4_msg(sb, KERN_CRIT, "Block reservation details");
	ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
		 EXT4_I(inode)->i_reserved_data_blocks);
	ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
	       EXT4_I(inode)->i_reserved_meta_blocks);
	return;
}

/*
 * mpage_da_map_and_submit - go through the given space, map it
 *       if necessary, and then submit it for I/O
 *
 * @mpd - bh describing space
 *
 * The function skips space we know is already mapped to disk blocks.
 *
 */
static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
{
	int err, blks, get_blocks_flags;
	struct ext4_map_blocks map, *mapp = NULL;
	sector_t next = mpd->b_blocknr;
	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
	handle_t *handle = NULL;

	/*
	 * If the blocks are mapped already, or we couldn't accumulate
	 * any blocks, then proceed immediately to the submission stage.
	 */
	if ((mpd->b_size == 0) ||
	    ((mpd->b_state  & (1 << BH_Mapped)) &&
	     !(mpd->b_state & (1 << BH_Delay)) &&
	     !(mpd->b_state & (1 << BH_Unwritten))))
		goto submit_io;

	handle = ext4_journal_current_handle();
	BUG_ON(!handle);

	/*
	 * Call ext4_map_blocks() to allocate any delayed allocation
	 * blocks, or to convert an uninitialized extent to be
	 * initialized (in the case where we have written into
	 * one or more preallocated blocks).
	 *
	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
	 * indicate that we are on the delayed allocation path.  This
	 * affects functions in many different parts of the allocation
	 * call path.  This flag exists primarily because we don't
	 * want to change *many* call functions, so ext4_map_blocks()
	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
	 * inode's allocation semaphore is taken.
	 *
	 * If the blocks in question were delalloc blocks, set
	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
	 * variables are updated after the blocks have been allocated.
	 */
	map.m_lblk = next;
	map.m_len = max_blocks;
	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
	if (ext4_should_dioread_nolock(mpd->inode))
		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
	if (mpd->b_state & (1 << BH_Delay))
		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
	if (blks < 0) {
		struct super_block *sb = mpd->inode->i_sb;

		err = blks;
		/*
		 * If get_block returns EAGAIN or ENOSPC and there
		 * appear to be free blocks, we will just let
		 * mpage_da_submit_io() unlock all of the pages.
		 */
		if (err == -EAGAIN)
			goto submit_io;

		if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
			mpd->retval = err;
			goto submit_io;
		}

		/*
		 * A get_block failure will cause us to loop in
		 * writepages, because a_ops->writepage won't be able
		 * to make progress. The page will be redirtied by
		 * writepage and writepages will again try to write
		 * the same.
		 */
		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
			ext4_msg(sb, KERN_CRIT,
				 "delayed block allocation failed for inode %lu "
				 "at logical offset %llu with max blocks %zd "
				 "with error %d", mpd->inode->i_ino,
				 (unsigned long long) next,
				 mpd->b_size >> mpd->inode->i_blkbits, err);
			ext4_msg(sb, KERN_CRIT,
				"This should not happen!! Data will be lost\n");
			if (err == -ENOSPC)
				ext4_print_free_blocks(mpd->inode);
		}
		/* invalidate all the pages */
		ext4_da_block_invalidatepages(mpd);

		/* Mark this page range as having been completed */
		mpd->io_done = 1;
		return;
	}
	BUG_ON(blks == 0);

	mapp = &map;
	if (map.m_flags & EXT4_MAP_NEW) {
		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
		int i;

		for (i = 0; i < map.m_len; i++)
			unmap_underlying_metadata(bdev, map.m_pblk + i);
	}

	/*
	 * Update on-disk size along with block allocation.
	 */
	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
	if (disksize > i_size_read(mpd->inode))
		disksize = i_size_read(mpd->inode);
	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
		ext4_update_i_disksize(mpd->inode, disksize);
		err = ext4_mark_inode_dirty(handle, mpd->inode);
		if (err)
			ext4_error(mpd->inode->i_sb,
				   "Failed to mark inode %lu dirty",
				   mpd->inode->i_ino);
	}

submit_io:
	mpage_da_submit_io(mpd, mapp);
	mpd->io_done = 1;
}

#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
		(1 << BH_Delay) | (1 << BH_Unwritten))

/*
 * mpage_add_bh_to_extent - try to add one more block to the extent of blocks
 *
 * @mpd->lbh - extent of blocks
 * @logical - logical number of the block in the file
 * @bh - bh of the block (used to access block's state)
 *
 * The function is used to collect contiguous blocks in the same state.
 */
static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
				   sector_t logical, size_t b_size,
				   unsigned long b_state)
{
	sector_t next;
	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;

	/*
	 * XXX Don't go larger than mballoc is willing to allocate
	 * This is a stopgap solution.  We eventually need to fold
	 * mpage_da_submit_io() into this function and then call
	 * ext4_map_blocks() multiple times in a loop
	 */
	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
		goto flush_it;

	/* check if the reserved journal credits might overflow */
	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
			/*
			 * With non-extent format we are limited by the journal
			 * credit available.  Total credit needed to insert
			 * nrblocks contiguous blocks is dependent on the
			 * nrblocks.  So limit nrblocks.
			 */
			goto flush_it;
		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
				EXT4_MAX_TRANS_DATA) {
			/*
			 * Adding the new buffer_head would make it cross the
			 * allowed limit for which we have journal credit
			 * reserved. So limit the new bh->b_size
			 */
			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
						mpd->inode->i_blkbits;
			/* we will do mpage_da_submit_io in the next loop */
		}
	}
	/*
	 * First block in the extent
	 */
	if (mpd->b_size == 0) {
		mpd->b_blocknr = logical;
		mpd->b_size = b_size;
		mpd->b_state = b_state & BH_FLAGS;
		return;
	}

	next = mpd->b_blocknr + nrblocks;
	/*
	 * Can we merge the block to our big extent?
	 */
	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
		mpd->b_size += b_size;
		return;
	}

flush_it:
	/*
	 * We couldn't merge the block to our extent, so we
	 * need to flush the current extent and start a new one.
	 */
	mpage_da_map_and_submit(mpd);
	return;
}

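/*
 * Return true if the buffer is dirty and still needs block allocation
 * or unwritten-extent conversion.
 */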
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
{
	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
}

/*
 * This function grabs code from the very beginning of
 * ext4_map_blocks, but assumes that the caller is from delayed write
 * time. This function looks up the requested blocks and sets the
 * buffer delay bit under the protection of i_data_sem.
 */
static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
			      struct ext4_map_blocks *map,
			      struct buffer_head *bh)
{
	int retval;
	sector_t invalid_block = ~((sector_t) 0xffff);

	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
		invalid_block = ~0;

	map->m_flags = 0;
	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, map->m_len,
		  (unsigned long) map->m_lblk);
	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read((&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
	else
		retval = ext4_ind_map_blocks(NULL, inode, map, 0);

	if (retval == 0) {
		/*
		 * XXX: __block_prepare_write() unmaps passed block,
		 * is it OK?
		 */
		/* If the block was allocated from a previously allocated
		 * cluster, then we don't need to reserve it again. */
		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
			retval = ext4_da_reserve_space(inode, iblock);
			if (retval)
				/* not enough space to reserve */
				goto out_unlock;
		}

		retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
		if (retval)
			goto out_unlock;

		/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
		 * and it should not appear on the bh->b_state.
		 */
		map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;

		map_bh(bh, inode->i_sb, invalid_block);
		set_buffer_new(bh);
		set_buffer_delay(bh);
	}

out_unlock:
	up_read((&EXT4_I(inode)->i_data_sem));

	return retval;
}

/*
 * This is a special get_blocks_t callback which is used by
 * ext4_da_write_begin().  It will either return a mapped block or
 * reserve space for a single block.
 *
 * For a delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
 * We also have b_blocknr = -1 and b_bdev initialized properly.
 *
 * For an unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
 * We also have b_blocknr = the physical block mapping the unwritten extent
 * and b_bdev initialized properly.
 */
static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
				  struct buffer_head *bh, int create)
{
	struct ext4_map_blocks map;
	int ret = 0;

	BUG_ON(create == 0);
	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);

	map.m_lblk = iblock;
	map.m_len = 1;

	/*
	 * First, we need to know whether the block is allocated already;
	 * preallocated blocks are unmapped but should be treated
	 * the same as allocated blocks.
	 */
	ret = ext4_da_map_blocks(inode, iblock, &map, bh);
	if (ret <= 0)
		return ret;

	map_bh(bh, inode->i_sb, map.m_pblk);
	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;

	if (buffer_unwritten(bh)) {
		/* A delayed write to an unwritten bh should be marked
		 * new and mapped.  Mapped ensures that we don't do
		 * get_block multiple times when we write to the same
		 * offset and new ensures that we do a proper zero out
		 * for a partial write.
		 */
		set_buffer_new(bh);
		set_buffer_mapped(bh);
	}
	return 0;
}

/*
 * This function is used as a standard get_block_t callback function
 * when there is no desire to allocate any blocks.  It is used as a
 * callback function for block_write_begin() and block_write_full_page().
 * These functions should only try to map a single block at a time.
 *
 * Since this function doesn't do block allocations even if the caller
 * requests it by passing in create=1, it is critically important that
 * any caller checks to make sure that any buffer heads returned by
 * this function are either all already mapped or marked for
 * delayed allocation before calling block_write_full_page().  Otherwise,
 * b_blocknr could be left uninitialized, and the page write functions
 * will be taken by surprise.
 */
static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
	return _ext4_get_block(inode, iblock, bh_result, 0);
}

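/*
 * bget_one()/bput_one() are walk_page_buffers() callbacks used by
 * __ext4_journalled_writepage() to take and drop a reference on each
 * buffer, so the buffers cannot go away while the page is unlocked.
 */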
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);
	return 0;
}

static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);
	return 0;
}

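/*
 * Write out a page of journalled inode data: start a new handle and run
 * every buffer in the range through do_journal_get_write_access() and
 * write_end_fn(), so the data goes through the journal rather than being
 * submitted directly for block I/O.
 */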
static int __ext4_journalled_writepage(struct page *page,
				       unsigned int len)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	struct buffer_head *page_bufs;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	ClearPageChecked(page);
	page_bufs = page_buffers(page);
	BUG_ON(!page_bufs);
	walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
	/* As soon as we unlock the page, it can go away, but we have
	 * references to buffers so we are safe */
	unlock_page(page);

	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	BUG_ON(!ext4_handle_valid(handle));

	ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
				do_journal_get_write_access);

	err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
				write_end_fn);
	if (ret == 0)
		ret = err;
	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
	err = ext4_journal_stop(handle);
	if (!ret)
		ret = err;

	walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
	return ret;
}

/*
 * Note that we don't need to start a transaction unless we're journaling data
 * because we should have holes filled from ext4_page_mkwrite(). We don't even
 * need to file the inode to the transaction's list in ordered mode because if
 * we are writing back data added by write(), the inode is already there and if
 * we are writing back data modified via mmap(), no one guarantees in which
 * transaction the data will hit the disk. In case we are journaling data, we
 * cannot start a transaction directly because transaction start ranks above
 * page lock so we have to do some magic.
 *
 * This function can get called via...
 *   - ext4_da_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
 *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
 *
 * We don't do any block allocation in this function. If we have a page with
 * multiple blocks we need to write those buffer_heads that are mapped. This
 * is important for mmap-based writes. So if, with a blocksize of 1K, we do
 * truncate(f, 1024);
 * a = mmap(f, 0, 4096);
 * a[0] = 'a';
 * truncate(f, 4096);
 * then the first buffer_head in the page is mapped via the page_mkwrite
 * callback, but the other buffer_heads are unmapped yet dirty (dirtied via
 * do_wp_page). So writepage should write the first block. If we modify
 * the mmap area beyond 1024 we will again get a page_fault and the
 * page_mkwrite callback will do the block allocation and mark the
 * buffer_heads mapped.
 *
 * We redirty the page if it has any buffer_heads that are either delayed
 * or unwritten.
 *
 * We can get recursively called as shown below.
 *
 *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *		ext4_writepage()
 *
 * But since we don't do any block allocation we should not deadlock.
 * The page also has its dirty flag cleared, so we don't get a recursive
 * page_lock.
 */
static int ext4_writepage(struct page *page,
			  struct writeback_control *wbc)
{
	int ret = 0, commit_write = 0;
	loff_t size;
	unsigned int len;
	struct buffer_head *page_bufs = NULL;
	struct inode *inode = page->mapping->host;

	trace_ext4_writepage(page);
	size = i_size_read(inode);
	if (page->index == size >> PAGE_CACHE_SHIFT)
		len = size & ~PAGE_CACHE_MASK;
	else
		len = PAGE_CACHE_SIZE;

	/*
	 * If the page does not have buffers (for whatever reason),
	 * try to create them using __block_write_begin.  If this
	 * fails, redirty the page and move on.
	 */
	if (!page_has_buffers(page)) {
		if (__block_write_begin(page, 0, len,