当前位置：

首页
/
IT
/
程序
/
Linux块设备IO子系统(二) _页高速缓存

Linux块设备IO子系统(二) _页高速缓存

磁盘驱动就是实现磁盘空间和内存空间数据上的交互，在上一篇中我们讨论了内存端的 Page Segment Block Sector 相关的概念，本文以 3.14 内核为例，讨论这部分内存是如何被组织管理的。我们知道，为了解决 CPU 和内存的速度不匹配，计算机系统引入了 Cache 缓存机制，这种硬件 Cache 的速度接近 CPU 内部寄存器的速度，可以提高系统效率，同样的思路也适用于解决内存和磁盘的速度不匹配问题，此外，磁盘多是机械式的，从寿命的角度考虑也不适合频繁读写，所以内核就将一部分内存作为缓存，提高磁盘访问速度的同时延长磁盘寿命，这种缓存就是磁盘高速缓存。包括页高速缓存 (Page Cache，对完整数据页进行操作的磁盘高速缓存) + 目录项高速缓存 (Dentry Cache，描述文件系统路径名的目录项对象) + 索引节点高速缓存 (Inode Cache，存放的是描述磁盘索引节点的索引节点对象)，本文主要讨论页高速缓存，有了页高速缓存，内核的代码和数据结构不必从磁盘读，也不必写入磁盘。页高速缓存可以看作特定文件系统层的一部分。

绝大多数情况下，内核在读写磁盘时都引用页高速缓存。新页被追加到页高速缓存以满足用户态进程的读请求。如果页不在高速缓存中，新页就被加到高速缓存中，然后用从磁盘读出的数据填充它，如果内存有足够的空闲空间，就让该页在高速缓存中长期保留，使其他进程再使用该页时不再访问磁盘。

同样，在把一页数据写到块设备之前，内核首先检查对应的页是否已经在高速缓存中，如果不在，就要先在其中增加一个新项，并用要写到磁盘中的数据填充该项。IO 数据的传送不是马上开始，而是延迟几秒才对磁盘进行更新，从而使进程有机会对要写入磁盘的数据做进一步的修改。

页高速缓存中的页可能是下面几种之一：

含有普通文件数据的页 (上篇中的一个 Page)
含有目录的页
含有直接从块设备文件 (跳过文件系统) 读出的数据的页。
含有用户态进程数据的页
属于特殊文件系统文件的页，如 shm

从 inode 到 page

既然是建立一块磁盘空间和一块内存空间之间的关系，那么就要通过相关的结构表示这种关系，在磁盘端，存储空间本质上都是属于一个文件，Linux 中用 inode 结构表示一个文件，内存端，Linux 内核用 address_space 来组织一组内存页，所以，我们可以在 inode 结构中找到相应的 address_space 对象域，而这个文件就成为该页的所有者 (owner)。简单的追一下代码，我们可以画出下面这张关系图，本节主要围绕这张图讨论

inode

inode 是内核中描述一个文件的结构，更多关于 inode 的讨论，可以参考，本文中我们主要关心 i_mapping 和 i_data 两个成员。

//3.14/include/linux/fs.h
527 struct inode {
    541 struct address_space * i_mapping;
    594 struct address_space i_data;
    616
};

address_space

页高速缓存的核心结构就是 address_space 对象，它是一个嵌入在页所有者的索引节点对象中的数据结构。高速缓存中的许多页都可能属于一个所有者，从而可能被链接到同一个 address_space 对象。该对象还在所有者的页和对这些页的操作之间建立起链接关系。

412 struct address_space {
    413 struct inode * host;
    /* owner: inode, block_device */
    414 struct radix_tree_root page_tree;
    /* radix tree of all pages */
    415 spinlock_t tree_lock;
    /* and lock protecting it */
    416 unsigned int i_mmap_writable;
    /* count VM_SHARED mappings */
    417 struct rb_root i_mmap;
    /* tree of private and shared mappings */
    418 struct list_head i_mmap_nonlinear;
    /*list VM_NONLINEAR mappings */
    419 struct mutex i_mmap_mutex;
    /* protect tree, count, list */
    420
    /* Protected by tree_lock together with the radix tree */
    421 unsigned long nrpages;
    /* number of total pages */
    422 pgoff_t writeback_index;
    /* writeback starts here */
    423 const struct address_space_operations * a_ops;
    /* methods */
    424 unsigned long flags;
    /* error bits/gfp mask */
    425 struct backing_dev_info * backing_dev_info;
    /* device readahead, etc */
    426 spinlock_t private_lock;
    /* for use by the address_space */
    427 struct list_head private_list;
    /* ditto */
    428 void * private_data;
    /* ditto */
    429
}
__attribute__((aligned(sizeof(long))));

radix_tree_root

描述一个 radix 树的根，内核使用这个数据结构快速的查找增删一个 inode 拥有的页高速缓存页

64 struct radix_tree_root {
    65 unsigned int height;
    66 gfp_t gfp_mask;
    67 struct radix_tree_node __rcu * rnode;
    68
};

50 struct radix_tree_node {
    51 unsigned int height;
    /* Height from the bottom */
    52 unsigned int count;
    53 union {
        54 struct radix_tree_node * parent;
        /* Used when ascending tree */
        55 struct rcu_head rcu_head;
        /* Used when freeing node */
        56
    };
    57 void __rcu * slots[RADIX_TREE_MAP_SIZE];
    58 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
    59
};

74 enum pageflags {
    75 PG_locked,
    /* Page is locked. Don't touch. */
    76 PG_error,
    77 PG_referenced,
    78 PG_uptodate,
    79 PG_dirty,
    80 PG_lru,
    81 PG_active,
    82 PG_slab,
    83 PG_owner_priv_1,
    /* Owner use. If pagecache, fs may use*/
    84 PG_arch_1,
    85 PG_reserved,
    86 PG_private,
    /* If pagecache, has fs-private data */
    87 PG_private_2,
    /* If pagecache, has fs aux data */
    88 PG_writeback,
    /* Page is under writeback */
    93 PG_compound,
    /* A compound page */
    95 PG_swapcache,
    /* Swap page: swp_entry_t in private */
    96 PG_mappedtodisk,
    /* Has blocks allocated on-disk */
    97 PG_reclaim,
    /* To be reclaimed asap */
    98 PG_swapbacked,
    /* Page is backed by RAM/swap */
    99 PG_unevictable,
    /* Page is "unevictable"  */
    112 __NR_PAGEFLAGS,
    113 114
    /* Filesystems */
    115 PG_checked = PG_owner_priv_1,
    116 117
    /* Two page bits are conscripted by FS-Cache to maintain local caching
118          * state.  These bits are set on pages belonging to the netfs's inodes
119          * when those inodes are being locally cached.
120          */
    121 PG_fscache = PG_private_2,
    /* page backed by cache */
    122 123
    /* XEN */
    124 PG_pinned = PG_owner_priv_1,
    125 PG_savepinned = PG_dirty,
    126 127
    /* SLOB */
    128 PG_slob_free = PG_private,
    129
};

page

page 就是内核中页描述符，通过 radix 树的操作，我们最终可以找到一组 page，这组 page 归属于一个 inode。我们可以看到其中的 pgoff_t index 成员，用来表示当前页在整组高速缓存页中的索引。至此，我们就通过一个文件的 inode 找到了它拥有的页高速缓存，接下来就是使用块设备驱动实现相应的页缓存和磁盘的数据交互。

44 struct page {
    48 union {
        49 struct address_space * mapping;
        /* If low bit clear, points to inode address_space, or NULL ... */
 57         };
 59         /* Second double word */
        60 struct {
            61 union {
                62 pgoff_t index;
                /* Our offset within mapping. */
                73
            };
            121
        };
        198
    }

页高速缓存的基本操作是增删查更，在此基础上可以封装更高级的 API

增加 page

static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)

删除 page

void delete_from_page_cache(struct page *page)

查找 page

struct page *find_get_page(struct address_space *mapping, pgoff_t offset) 
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,unsigned int nr_pages, struct page **pages)
//抢不到锁会阻塞
struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)   
//抢不到锁立即返回try???
//抢不到锁会阻塞，但是如果页不存在就创建新的
struct page *find_or_create_page(struct address_space *mapping,pgoff_t index, gfp_t gfp_mask)

更新 page

struct page *read_cache_page(struct address_space *mapping,pgoff_t index,int (*filler)(void *, struct page *),void *data)

从 page 到 block

block 即是那个 VFS 或文件系统中的最小逻辑操作单位，一个页高速缓存可以由几个 block 构成，他们之间的关系如下：

buffer_head

每个块缓冲区都有相应的 buffer_head 对象描述，该描述符包含内核必须了解的、有关如何处理块的所有信息。

62 struct buffer_head {
    63 unsigned long b_state;
    /* buffer state bitmap (see above) */
    64 struct buffer_head * b_this_page;
    /* circular list of page's buffers */
    65 struct page * b_page;
    /* the page this bh is mapped to */
    66 67 sector_t b_blocknr;
    /* start block number */
    68 size_t b_size;
    /* size of mapping */
    69 char * b_data;
    /* pointer to data within the page */
    70 71 struct block_device * b_bdev;
    72 bh_end_io_t * b_end_io;
    /* I/O completion */
    73 void * b_private;
    /* reserved for b_end_io */
    74 struct list_head b_assoc_buffers;
    /* associated with another mapping */
    75 struct address_space * b_assoc_map;
    /* mapping this buffer is
 76                                                    associated with */
    77 atomic_t b_count;
    /* users using this buffer_head */
    78
};

下面是 b_state 可能的取值

//include/linux/buffer_head.h
19 enum bh_state_bits {
    20 BH_Uptodate,
    /* Contains valid data */
    21 BH_Dirty,
    /* Is dirty */
    22 BH_Lock,
    /* Is locked */
    23 BH_Req,
    /* Has been submitted for I/O */
    24 BH_Uptodate_Lock,
    /* Used by the first bh in a page, to serialise
 25                           * IO completion of other buffers in the page
 26                           */
    27 28 BH_Mapped,
    /* Has a disk mapping */
    29 BH_New,
    /* Disk mapping was newly created by get_block */
    30 BH_Async_Read,
    /* Is under end_buffer_async_read I/O */
    31 BH_Async_Write,
    /* Is under end_buffer_async_write I/O */
    32 BH_Delay,
    /* Buffer is not yet allocated on disk */
    33 BH_Boundary,
    /* Block is followed by a discontiguity */
    34 BH_Write_EIO,
    /* I/O error on write */
    35 BH_Unwritten,
    /* Buffer is allocated on disk but not written */
    36 BH_Quiet,
    /* Buffer Error Prinks to be quiet */
    37 BH_Meta,
    /* Buffer contains metadata */
    38 BH_Prio,
    /* Buffer should be submitted with REQ_PRIO */
    39 BH_Defer_Completion,
    /* Defer AIO completion to workqueue */
    40 41 BH_PrivateStart,
    /* not a state bit, but the first bit available
 42                          * for private allocation by other entities
 43                          */
    44
};

增

将块设备缓冲区所在的页添加到页高速缓存中

static int grow_buffers(struct block_device *bdev, sector_t block, int size)

删

将块设备缓冲区所在的页从页高速缓存中剔除

int try_to_free_buffers(struct page *page) 
int try_to_release_page(struct page *page, gfp_t gfp_mask)

查

在页高速缓存中搜索块
当内核需要读写一个单独的物理设备块时 (例如一个超级块)，必须检查所请求的块缓冲区是否已经在页高速缓存中。在页高速缓存中搜索指定的块缓冲区 (由块设备描述符的地址 bdev 和逻辑块号 nr 表示) 的过程可以分成 3 个步骤：

获取一个指针，让它指向包含指定的块设备的 address_space 对象 (bdev->bd_inode->i_mapping)
获取块设备的块大小 (bdev->bd_block_size)，并计算包含指定块的页索引。这需要在逻辑块号上进行移位操作，eg，如果块的大小是 1024 字节，每个缓冲区页包含 4 个块缓冲区，那么页的索引是 nr/4
在块设备的基树中搜索缓冲区页，获得页描述符后，内核访问缓冲区首部，它描述了页中块缓冲区的状态

__find_get_block(struct block_device *bdev, sector_t block, unsigned size) 
__getblk(struct block_device *bdev, sector_t block, unsigned size)    
__bread(struct block_device *bdev, sector_t block, unsigned size)

交

块缓冲区最终要提交到通用块层进行 IO 操作，相关的 API 如下

int submit_bh(int rw, struct buffer_head *bh) 
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])

脏页回写

在脏页回写机制中，首先，内核使用一个 backing_dev_info 对象来描述一个 bdi 设备，即 backing device info——非易失存储设备，这些 backing_dev_info 都会挂接到 bdi_list 链表中，我们可以从其注册函数 bdi_register() 中看出。由于 bdi 的低速，所以也就有了本文讨论的页缓冲机制以及脏页回写问题，当一个高速缓存页被修改时，内核就会将相应的 page 对象中的相应的 tag 置为 PG_dirty，即“脏页”，脏页需要在合适的时候回写到磁盘。对于脏页回写，2.6.2x/3x 以前的内核通过动态的创建 / 删除 pdflush 线程来实现脏页回写，但是 2.6.2x/3x 之后的内核对这个方面进行了改进，采用 writeback 机制进行回写，writeback 机制需要的核心结构和方法的关系如下。

可以看出，一个逻辑磁盘 -- 一个 gendisk 对象 -- 一个 request_queue 对象 -- 一个 backing_dev_info 对象，这个 backing_dev_info 对象就是脏页回写的核心结构

64 struct backing_dev_info {
    65 struct list_head bdi_list;
    97 struct bdi_writeback wb;
    /* default writeback info for this bdi */
    100 struct list_head work_list;
    102 struct device * dev;
    110
};

37
/*     
  38  * Passed into wb_writeback(), essentially a subset of writeback_control
  39  */
40 struct wb_writeback_work {
    41 long nr_pages;
    42 struct super_block * sb;
    43 unsigned long * older_than_this;
    44 enum writeback_sync_modes sync_mode;
    45 unsigned int tagged_writepages: 1;
    46 unsigned int for_kupdate: 1;
    47 unsigned int range_cyclic: 1;
    48 unsigned int for_background: 1;
    49 unsigned int for_sync: 1;
    /* sync(2) WB_SYNC_ALL writeback */
    50 enum wb_reason reason;
    /* why was writeback initiated? */
    51 52 struct list_head list;
    /* pending work list */
    53 struct completion * done;
    /* set if the caller waits */
    54
};

51 struct bdi_writeback {
    52 struct backing_dev_info * bdi;
    /* our parent bdi */
    53 unsigned int nr;
    54 55 unsigned long last_old_flush;
    /* last old data flush */
    56 57 struct delayed_work dwork;
    /* work item used for writeback */
    58 struct list_head b_dirty;
    /* dirty inodes */
    59 struct list_head b_io;
    /* parked for writeback */
    60 struct list_head b_more_io;
    /* parked for more writeback */
    61 spinlock_t list_lock;
    /* protects the b_* lists */
    62
};

 778 /*     
 779  * Explicit flushing or periodic writeback of "old" data.
 780  *
 781  * Define "old": the first time one of an inode's pages is dirtied, we mark the
 782  * dirtying-time in the inode's address_space.  So this periodic writeback code
 783  * just walks the superblock inode list, writing back any inodes which are
 784  * older than a specific point in time.
 785  *     
 786  * Try to run once per dirty_writeback_interval.  But if a writeback event
 787  * takes longer than a dirty_writeback_interval interval, then leave a
 788  * one-second gap.
 789  *      
 790  * older_than_this takes precedence over nr_to_write.  So we'll only write back
 791  * all dirty pages if they are all attached to "old" mappings.
 792  */
 793 static long wb_writeback(struct bdi_writeback *wb,
 794                          struct wb_writeback_work *work)

来源: http://www.cnblogs.com/xiaojiang1025/p/6605776.html

与本文相关文章

暂无,快来抢沙发吧！