生产环境中,后端有时候 rm,会出现一些问题.
这里作为一个子问题,讨论一下 rm 之后,发生的一些事.
打开 rm 源码:
[qianzichen@dev03v /src/app/coreutils/coreutils-8.21]$ vi src/rm.c
从 main 函数开始:
int
main (int argc, char **argv)
{
...
while ((c = getopt_long (argc, argv, "dfirvIR", long_opts, NULL)) != -1)
{
switch (c)
{
case 'f':
x.interactive = RMI_NEVER;
break;
...
}
}
...
enum RM_status status = rm (file, &x);
}
首先解析命令行参数,然后调用了 rm:
enum RM_status status = rm(file, &x);
作者把 rm 函数的实现从 rm.c 中抽了出来,放在 remove.c 中:
/* Remove FILEs, honoring options specified via X.
Return RM_OK if successful. */
enum RM_status
rm (char *const *file, struct rm_options const *x)
{
enum RM_status rm_status = RM_OK;
if (*file)
{
FTS *fts = xfts_open (file, bit_flags, NULL);
while (1)
{
...
}
}
...
}
file 参数是一个只读指针数组,代表要删除的文件名列表,x 参数的结构定义如下,存储从命令行中解析后的 rm 的选项.
struct rm_options
{
/* If true, ignore nonexistent files. */
bool ignore_missing_files;
/* If true, query the user about whether to remove each file. */
enum rm_interactive interactive;
...
/* If true, recursively remove directories. */
bool recursive;
bool require_restore_cwd;
};
当 file 列表存在时,rm 调用 xfts_open:
FTS *
xfts_open (char * const *argv, int options,
int (*compar) (const FTSENT **, const FTSENT **))
{
FTS *fts = fts_open (argv, options | FTS_CWDFD, compar);
if (fts == NULL)
{
...
return fts;
}
xfts_open 返回 fts_open 的有效返回值.fts_open 的实现如下:
FTS *
fts_open (char * const *argv,
register int options,
int (*compar) (FTSENT const **, FTSENT const **))
{
register FTS *sp;
/* Options check. */
/* Allocate/initialize the stream */
/* Initialize fts_cwd_fd. */
sp->fts_cwd_fd = AT_FDCWD;
if ( ISSET(FTS_CWDFD) && ! HAVE_OPENAT_SUPPORT)
{
int fd = open (".",
O_SEARCH | (ISSET (FTS_NOATIME) ? O_NOATIME : 0));
/*
* Start out with 1K of file name space, and enough, in any case,
* to hold the user's file names.
*/
/* Allocate/initialize root's parent. */
if (*argv != NULL) {
if ((parent = fts_alloc(sp, "", 0)) == NULL)
goto mem2;
parent->fts_level = FTS_ROOTPARENTLEVEL;
}
/* Allocate/initialize root(s). */
for (root = NULL, nitems = 0; *argv != NULL; ++argv, ++nitems) {
/*
* If comparison routine supplied, traverse in sorted
* order; otherwise traverse in the order specified.
*/
if (compar) {
p->fts_link = root;
root = p;
} else {
p->fts_link = NULL;
if (root == NULL)
tmp = root = p;
else {
tmp->fts_link = p;
tmp = p;
}
}
}
if (compar && nitems > 1)
root = fts_sort(sp, root, nitems);
...
if (!ISSET(FTS_NOCHDIR) && !ISSET(FTS_CWDFD)
&& (sp->fts_rfd = diropen (sp, ".")) < 0)
SET(FTS_NOCHDIR);
i_ring_init (&sp->fts_fd_ring, -1);
return (sp);
mem3: fts_lfree(root);
...
return (NULL);
}
引用中已去除了一些 Error handling,可以看出主要是获取文件系统的一些信息,保存在 FTS 结构中,FTS 结构定义如下:
typedef struct {
struct _ftsent *fts_cur; /* current node */
int (*fts_compar) (struct _ftsent const **, struct _ftsent const **);
/* compare fn */
...
int fts_options; /* fts_open options, global flags */
struct hash_table *fts_leaf_optimization_works_ht;
union {
...
struct cycle_check_state *state;
} fts_cycle;
I_ring fts_fd_ring;
} FTS;
再回到 rm 函数,它将在一个 loop 中通过 fts_read 读取文件系统信息,并缓存在 ent 中:
rm (char *const *file, struct rm_options const *x)
{
enum RM_status rm_status = RM_OK;
if (*file)
{
FTS *fts = xfts_open (file, bit_flags, NULL);
while (1)
{
ent = fts_read (fts);
enum RM_status s = rm_fts (fts, ent, x);
}
}
...
}
ent 的结构比较大,这里不展开了.
再通过 rm_fts 对某一个 ent 进行操作,这里我们 rm 的是一个 regular file,所以控制结构会执行到 FTS_F 分支下,最终调用 excise.
static enum RM_status
rm_fts (FTS *fts, FTSENT *ent, struct rm_options const *x)
{
switch (ent->fts_info)
{
case FTS_D: /* preorder directory */
if (s == RM_OK && is_empty_directory == T_YES)
{
/* When we know (from prompt when in interactive mode)
that this is an empty directory, don't prompt twice. */
s = excise (fts, ent, x, true);
fts_skip_tree (fts, ent);
}
...
}
case FTS_F: /* regular file */
{
bool is_dir = ent->fts_info == FTS_DP || ent->fts_info == FTS_DNR;
enum RM_status s = prompt (fts, ent, is_dir, x, PA_REMOVE_DIR, NULL);
if (s != RM_OK)
return s;
return excise (fts, ent, x, is_dir);
}
...
}
}
这里再次忽略一些容错和优化,excise 最终调用了 unlinkat
static enum RM_status
excise (FTS *fts, FTSENT *ent, struct rm_options const *x, bool is_dir)
{
int flag = is_dir ? AT_REMOVEDIR : 0;
if (unlinkat (fts->fts_cwd_fd, ent->fts_accpath, flag) == 0)
{
if (x->verbose)
{
printf ((is_dir
? _("removed directory: %s\n")
...
}
return RM_OK;
}
...
}
如上我们看出,rm 最终调用了 unlinkat 这一核心函数,比如,删除 a.txt:
unlinkat(AT_FDCWD, "a.txt", 0)
用户态 rm 调用了 C 库中的 unlinkat,经查找,其声明是在 <unistd.h> 中
#ifdef __USE_ATFILE
/* Remove the link NAME relative to FD. */
extern int unlinkat (int __fd, const char *__name, int __flag)
__THROW __nonnull ((2));
#endif
/* Remove the directory PATH. */
extern int rmdir (const char *__path) __THROW __nonnull ((1));
用户态进程只要调用 unlinkat 函数就可以了,具体的实现是由 glibc
提供的.先以同族的 unlink 为例,其通用定义在 io/unlink.c 中:
* Remove the link named NAME. * /
int
__unlink (name)
const char *name;
{
if (name == NULL)
{
__set_errno (EINVAL);
return -1;
}
__set_errno (ENOSYS);
return -1;
}
stub_warning (unlink)
weak_alias (__unlink, unlink)/
额好吧,这儿是个弱符号,真正的实现在./sysdeps/unix/sysv/linux/unlinkat.c
...
/* Remove the link named NAME. */
int
unlinkat (fd, file, flag)
int fd;
const char *file;
int flag;
{
int result;
#ifdef __NR_unlinkat
# ifndef __ASSUME_ATFCTS
if (__have_atfcts >= 0)
# endif
{
result = INLINE_SYSCALL (unlinkat, 3, fd, file, flag);
# ifndef __ASSUME_ATFCTS
if (result == -1 && errno == ENOSYS)
__have_atfcts = -1;
else
# endif
return result;
}
char *buf = NULL;
}
...
INTERNAL_SYSCALL_DECL (err);
if (flag & AT_REMOVEDIR)
result = INTERNAL_SYSCALL (rmdir, err, 1, file);
else
result = INTERNAL_SYSCALL (unlink, err, 1, file);
...
}
syscall 的 name 为 __NR_##name,通过宏中的记号粘合(token pasting)而得本例中的 __NR_unlinkat.其定义在 /usr/include/asm/unistd_64.h 中.
#ifndef _ASM_X86_UNISTD_64_H
#define _ASM_X86_UNISTD_64_H 1
#define __NR_read 0
#define __NR_write 1
...
#define __NR_newfstatat 262
#define __NR_unlinkat 263
...
#define __NR_kexec_file_load 320
#define __NR_userfaultfd 323
#endif /* _ASM_X86_UNISTD_64_H */
所以该宏被启用.
/* The *at syscalls were introduced just after 2.6.16-rc1. Due to the way the
kernel versions are advertised we can only rely on 2.6.17 to have
the code. On PPC they were introduced in 2.6.17-rc1,
on SH in 2.6.19-rc1. */
#if __LINUX_KERNEL_VERSION >= 0x020611 \
&& (!defined __sh__ || __LINUX_KERNEL_VERSION >= 0x020613)
# define __ASSUME_ATFCTS 1
#endif
显然可以看出,若 kernel 版本不低于 2.6.17(即 0x020611;SH 平台则要求不低于 2.6.19),__ASSUME_ATFCTS 宏被启用.无需校验 __have_atfcts >= 0,直接调用 INLINE_SYSCALL (unlinkat, 3, fd, file, flag).
这里直接看底层实现吧(./sysdeps/unix/sysv/linux/x86_64/sysdep.h),是一段内联汇编:
# undef INLINE_SYSCALL_TYPES
# define INLINE_SYSCALL_TYPES(name, nr, args...) \
({ \
unsigned long int resultvar = INTERNAL_SYSCALL_TYPES (name, , nr, args); \
if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (resultvar, ), 0)) \
{ \
__set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, )); \
resultvar = (unsigned long int) -1; \
} \
(long int) resultvar; })
# undef INTERNAL_SYSCALL_DECL
# define INTERNAL_SYSCALL_DECL(err) do { } while (0)
# define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
({ \
unsigned long int resultvar; \
LOAD_ARGS_##nr (args) \
LOAD_REGS_##nr \
asm volatile ( \
"syscall\n\t" \
: "=a" (resultvar) \
: "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx"); \
(long int) resultvar; })
# undef INTERNAL_SYSCALL
# define INTERNAL_SYSCALL(name, err, nr, args...) \
INTERNAL_SYSCALL_NCS (__NR_##name, err, nr, ##args)
# define INTERNAL_SYSCALL_NCS_TYPES(name, err, nr, args...) \
在 syscall 之前先将参数传入寄存器(系统调用号通过 "0" (name) 约束放入 rax).返回值在 rax 寄存器中,通常 0 表示成功.
从 C 库代码上来看,就是这么实现了的,rm 实用程序调用 glibc,然后再到汇编 syscall -> kernel
但是当前机器安装的不一定是 upstream 的 C 库.
我们还是来亲眼看一下最终机器码是如何实现的吧,我这里直接反汇编一下:
[qianzichen@dev03v /usr/lib64]$ objdump -D -S libc.so.6 > /tmp/libc.txt
[qianzichen@dev03v /usr/lib64]$ cd /tmp
[qianzichen@dev03v /tmp]$ grep -A12 'unlinkat' libc.txt
00000000000e9c00 <unlinkat>:
e9c00: 48 63 d2 movslq %edx,%rdx
e9c03: 48 63 ff movslq %edi,%rdi
e9c06: b8 07 01 00 00 mov $0x107,%eax
e9c0b: 0f 05 syscall
e9c0d: 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax
e9c13: 77 02 ja e9c17 <unlinkat+0x17>
e9c15: f3 c3 repz retq
e9c17: 48 8b 15 4a 12 2d 00 mov 0x2d124a(%rip),%rdx # 3bae68 <_DYNAMIC+0x2e8>
e9c1e: f7 d8 neg %eax
e9c20: 64 89 02 mov %eax,%fs:(%rdx)
e9c23: 48 83 c8 ff or $0xffffffffffffffff,%rax
e9c27: c3 retq
e9c28: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
e9c2f: 00
00000000000e9c30 <rmdir>:
e9c30: b8 54 00 00 00 mov $0x54,%eax
e9c35: 0f 05 syscall
[qianzichen@dev03v /tmp]$
这里可以看到 glibc-2.17 最终使用了一些 AT&T syntax Assembly language.
先用 movslq 指令(AT&T 语法,对应 Intel 语法中的 movsxd),把第一个操作数(32 位源寄存器)符号扩展到 64 位后写入第二个操作数(目的寄存器),高 32 位用符号位填充.
下一步,将 0x107 这个值载入 eax 寄存器
随后,调用 syscall 指令.
打开 Intel 的相关芯片手册,搜索 "syscall",找到相关描述如下图.
datasheet 中关于 syscall 的描述
从这段描述中看出,syscall 是处理器为 64 位模式提供的快速系统调用指令,它是为使用平坦内存模型(flat memory model)的操作系统设计的.在我当前的 64 位机器上,syscall/sysret 就和 32 位体系上的 sysenter/sysexit 的作用相似,也和旧平台的 int 0x80 软中断类似,主要是将 CPU 特权级从 level 3 切换到 level 0,以操作一些应用层无法访问的资源.
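从 "Use CPUID to check if SYSCALL and SYSRET are available (CPUID.80000001H.EDX[bit 11] = 1)" 这一句可以看出,在使用前可以先以 eax = 80000001H 执行 CPUID 指令,再检查返回的 edx 寄存器的 bit 11 是否为 1,以确认处理器支持 64 位平台的 syscall/sysret(这是只读的特性标志,而不是由软件去置位),好的我们找出 edx 寄存器相关.
edx 寄存器相关
不过上面反汇编中对 edx 寄存器的 movslq 操作,并不是 "使能 bit 11 位和 bit 29" 这种准备工作,而只是把 int 类型的参数(edx 中的 flag、edi 中的 fd)符号扩展为 64 位;写入 eax 的值暂时不知道是什么作用.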
我们确定了,unlinkat 是一个 system call,rm 实用程序将删除文件的任务交给操作系统,至此程序陷入内核态.
好的,我们现在到 kernel 下,直接搜索 unlinkat:
[qianzichen@dev03v /src/linux/linux]$ grep unlinkat ./ -rn
./arch/parisc/include/uapi/asm/unistd.h:297:#define __NR_unlinkat (__NR_Linux + 281)
./arch/parisc/kernel/syscall_table.S:379: ENTRY_SAME(unlinkat)
./arch/m32r/include/uapi/asm/unistd.h:309:#define __NR_unlinkat 301
./arch/m32r/kernel/syscall_table.S:303: .long sys_unlinkat
./arch/sparc/include/uapi/asm/unistd.h:358:#define __NR_unlinkat 290
./arch/sparc/kernel/systbls_32.S:78:/*290*/ .long sys_unlinkat,
./arch/ia64/include/uapi/asm/unistd.h:279:#define __NR_unlinkat 1287
./arch/ia64/kernel/entry.S:1695: data8 sys_unlinkat
./arch/ia64/kernel/fsys.S:815: data8 0 // unlinkat
./arch/alpha/include/uapi/asm/unistd.h:420:#define __NR_unlinkat 456
./arch/alpha/kernel/systbls.S:477: .quad sys_unlinkat
...
./arch/x86/entry/syscalls/syscall_32.tbl:310:301 i386 unlinkat sys_unlinkat
./arch/x86/entry/syscalls/syscall_64.tbl:272:263 common unlinkat sys_unlinkat
...
[qianzichen@dev03v /src/linux/linux]$
直接看 x86 体系下的源码:
[qianzichen@dev03v /src/linux/linux]$ vi arch/x86/entry/syscalls/syscall_64.tbl
这是一个列表文件,
#
# 64-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point>
#
# The abi is "common", "64" or "x32" for this file.
#
0 common read sys_read
...
261 common futimesat sys_futimesat
262 common newfstatat sys_newfstatat
263 common unlinkat sys_unlinkat
264 common renameat sys_renameat
265 common linkat sys_linkat
...
#
# x32-specific system call numbers start at 512 to avoid cache impact
# for native 64-bit operation.
#
512 x32 rt_sigaction compat_sys_rt_sigaction
...
这里看出,unlinkat 对应的 number 是 263
还记得写入 eax 寄存器中的值吗,是 0x107.
很显然,0x107 = 1 * 16 ^ 2 + 0 * 16 ^ 1 + 7 * 16 ^ 0 = 263
common 代表该系统调用由 64 位 ABI 与 x32 ABI 共用(32 位 i386 使用另一张表 syscall_32.tbl,编号也不同,例如上文 grep 结果中 unlinkat 在 i386 下是 301)
user space 和 kernel space 的 system call 映射建立.
其实 kernel space 对编号的映射不是这么简单,这里不再展开.
我们大概知道 user space 的 unlinkat 最终在 kernel space 的 entry point 是 sys_unlinkat 就好了.
还是直接查看汇编代码吧:
[qianzichen@dev03v /src/linux/linux]$ vi arch/x86/entry/entry_64.S
...
ENTRY(entry_SYSCALL_64)
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
TRACE_IRQS_OFF
/* Construct struct pt_regs on stack */
pushq $__USER_DS
...
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
movq %r10, %rcx
/*
* This call instruction is handled specially in stub_ptregs_64.
* It might end up jumping to the slow path. If it jumps, RAX
* and all argument registers are clobbered.
*/
call *sys_call_table(, %rax, 8)
...
END(entry_SYSCALL_64)
rax 中存的就是这次 syscall 的 num,即__NR_unlinkat.
ENTRY(entry_SYSCALL_64) 是 64 位的 syscall 汇编入口点,在准备一系列寄存器之后,call *sys_call_table(, %rax, 8) 将跳转到系统调用表中的偏移地址,也就是 sys_call_table 数组中下标为 syscall num 对应的函数.
sys_call_table 在另一个文件中定义,这里用到了一点编译器扩展和预编译技术的一种高效用法,这里也不再展开.
/* System call table for x86-64. */
...
#define __SYSCALL_64_QUAL_(sym) sym
#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
什么时候建立 syscall number 和 sys_unlinkat 的映射呢?这要看 <asm/syscalls_64.h>,这个头文件是一个中间文件,在编译时生成.原映射信息就是从上文提到的./arch/x86/entry/syscalls/syscall_64.tbl 中获得.
编译出来的 syscalls_64.h 结果为:
__SYSCALL_COMMON(49, sys_bind, sys_bind)
__SYSCALL_COMMON(50, sys_listen, sys_listen)
...
__SYSCALL_COMMON(263, sys_unlinkat, sys_unlinkat)
__SYSCALL_COMMON 就是__SYSCALL_64,如上文述 sys_call_table 的定义,第一个__SYSCALL_64 的定义是为了将 syscalls_64.h 展开为函数声明,之后将__SYSCALL_64 重新定义后,是为了将 syscalls_64.h 展开为数组成员的定义.
所以最终内核得到的,是一个只读的 sys_call_table 数组,下标为 syscall number,指向的是内核的 sys_call_ptr_t.syscall num 从 0 开始,所以直接根据 263 就可以找到 sys_unlinkat.
现在内核已经确定了要调用的是 sys_unlinkat,那么这个函数在哪里定义的呢?经过我的一番尝试,4.9 中直接找 sys_unlinkat 是找不到实现的,因为这个字符串可能经过预编译粘合.
我最终找到的宏是这样定义的:
...
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
__attribute__((alias(__stringify(SyS##name)))); \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
...
然后找到,sys_unlinkat 的代码在 fs/namei.c 中:
/*
 * unlinkat system call: validate FLAG, then hand the request to the
 * directory or non-directory removal helper.
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	/* AT_REMOVEDIR is the only flag bit accepted here. */
	if (flag & ~AT_REMOVEDIR)
		return -EINVAL;

	return (flag & AT_REMOVEDIR) ? do_rmdir(dfd, pathname)
				     : do_unlinkat(dfd, pathname);
}
然后调用 do_unlinkat:
/*
 * Make sure that the actual truncation of the file will occur outside its
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 *
 * do_unlinkat - remove the entry named by the user-space string PATHNAME,
 * resolved relative to DFD.  Returns the error code of the first failing
 * step, otherwise the result of vfs_unlink().
 */
static long do_unlinkat(int dfd, const char __user *pathname)
{
	int error;
	struct filename *name;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	struct inode *inode = NULL;
	struct inode *delegated_inode = NULL;
	unsigned int lookup_flags = 0;
retry:
	/* Resolve the parent into PATH; LAST/TYPE describe the final component. */
	name = filename_parentat(dfd, getname(pathname), lookup_flags,
				 &path, &last, &type);
	if (IS_ERR(name))
		return PTR_ERR(name);

	/* Only a LAST_NORM final component can be unlinked. */
	error = -EISDIR;
	if (type != LAST_NORM)
		goto exit1;

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit1;
retry_deleg:
	/* Look the victim up with the parent directory locked. */
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
		if (last.name[last.len])
			goto slashes;
		/*
		 * Keep the victim in 'inode' and take a reference on it, so
		 * that the iput() below runs after the parent is unlocked.
		 */
		inode = dentry->d_inode;
		if (d_is_negative(dentry))
			goto slashes;
		ihold(inode);
		error = security_path_unlink(&path, dentry);
		if (error)
			goto exit2;
		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
exit2:
		dput(dentry);
	}
	inode_unlock(path.dentry->d_inode);
	if (inode)
		iput(inode);	/* truncate the inode here */
	inode = NULL;
	if (delegated_inode) {
		/* Wait for the delegation to be broken, then try again. */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(path.mnt);
exit1:
	path_put(&path);
	putname(name);
	if (retry_estale(error, lookup_flags)) {
		/* Redo the whole lookup with LOOKUP_REVAL set. */
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
	return error;

slashes:
	/* Choose the error for the cases rejected above, then clean up. */
	if (d_is_negative(dentry))
		error = -ENOENT;
	else if (d_is_dir(dentry))
		error = -EISDIR;
	else
		error = -ENOTDIR;
	goto exit2;
}
好了,读者随着我到这一步,已经看到了软件工程中比较具有美感的一个地方:4044 行,调用了 vfs_unlink.从 user space 到 system call 再至此,sys_unlinkat 将 unlinkat 的任务,dispatch 给操作系统的虚拟文件系统.
我们看一下 vfs_unlink 的实现:
/**
* vfs_unlink - unlink a filesystem object
* @dir: parent directory
* @dentry: victim
* @delegated_inode: returns victim inode, if the inode is delegated.
*
* The caller must hold dir->i_mutex.
*
* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
* return a reference to the inode in delegated_inode. The caller
* should then break the delegation on that inode and retry. Because
* breaking a delegation may take a long time, the caller should drop
* dir->i_mutex before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*/
int vfs_unlink(struct inode * dir, struct dentry * dentry, struct inode * *delegated_inode) {
struct inode * target = dentry - >d_inode;
int error = may_delete(dir, dentry, 0);
if (error) return error;
if (!dir - >i_op - >unlink) return - EPERM;
inode_lock(target);
if (is_local_mountpoint(dentry)) error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
if (!error) {
error = try_break_deleg(target, delegated_inode);
if (error) goto out;
error = dir - >i_op - >unlink(dir, dentry);
if (!error) {
dont_mount(dentry);
detach_mounts(dentry);
}
}
out: inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
if (!error && !(dentry - >d_flags & DCACHE_NFSFS_RENAMED)) {
fsnotify_link_count(target);
d_delete(dentry);
}
return error;
}
EXPORT_SYMBOL(vfs_unlink);
我们看到,(原文件的)3979 行,调用 dir 这个 inode 实例中 i_op 成员的 unlink 函数指针,这个指针才指向了具体文件系统的真正实现.
现在看 inode 结构的定义:
/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
* of the 'struct inode'
*/
struct inode {
umode_t i_mode;
...
const struct inode_operations *i_op;
struct super_block *i_sb;
/* Stat data, not accessed from path walking */
unsigned long i_ino;
...
#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
struct fscrypt_info *i_crypt_info;
#endif
void *i_private; /* fs or device private pointer */
};
可以看到上文的 inode 实例中的 i_op 成员是一个 inode_operations 结构指针.
现在看 inode_operations 的定义:
struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
...
int (*create) (struct inode *,struct dentry *, umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
...
} ____cacheline_aligned;
vfs 下层的各种文件系统,需要按照 inode_operations 中的规范,完成 unlink 的实现,向 kernel vfs 注册.
这里不展开 bootloader 自举之后的硬件初始化,也忽略 kernel 接管机器资源之后的一些 register 机制,直接看当前机器是怎么向 vfs 最终注册.
看了一下,我机器上挂载的是 ext4 文件系统,直接看 ext4 的 unlink 的最终注册过程:
...
3845 /*
3846 * directories can handle most operations...
3847 */
3848 const struct inode_operations ext4_dir_inode_operations = {
...
3851 .link = ext4_link,
3852 .unlink = ext4_unlink,
3853 .symlink = ext4_symlink,
...
3865 }
ext4_dir_inode_operations 实例中,完成了函数指针的赋值.
直接看 ext4_unlink 的实现:
/*
 * ext4_unlink - remove the directory entry for DENTRY from directory DIR.
 *
 * Returns 0-or-error in retval style: -EIO when the filesystem reports a
 * forced shutdown, -ENOENT when no matching entry is found, -EFSCORRUPTED
 * when the entry's inode number does not match the dentry's inode, or the
 * error returned by the quota, lookup, journal or delete-entry helpers.
 */
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
struct inode *inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
return -EIO;
trace_ext4_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
retval = dquot_initialize(dir);
if (retval)
return retval;
retval = dquot_initialize(d_inode(dentry));
if (retval)
return retval;
/* Default result if the name is not found in DIR. */
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
goto end_unlink;
inode = d_inode(dentry);
/* The on-disk entry must refer to the same inode number as the dentry. */
retval = -EFSCORRUPTED;
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_unlink;
handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
/* Clear the handle so end_unlink does not call ext4_journal_stop(). */
handle = NULL;
goto end_unlink;
}
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
/* A link count of 0 is unexpected here: warn and set it to 1 before the
 * drop_nlink() below. */
if (inode->i_nlink == 0) {
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
dentry->d_name.len, dentry->d_name.name);
set_nlink(inode, 1);
}
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
/* Entry removed: update the directory's timestamps and mark it dirty. */
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
/* Drop one link from the victim; with no links left, add it to the
 * orphan list. */
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
/* Common exit once the lookup has run; the earlier returns skip it. */
end_unlink:
/* NOTE(review): bh is NULL on the not-found path; presumably brelse()
 * accepts NULL -- confirm. */
brelse(bh);
if (handle)
ext4_journal_stop(handle);
trace_ext4_unlink_exit(dentry, retval);
return retval;
}
看 d_inode 的实现:
/* Return the inode pointer stored in DENTRY's d_inode field. */
static inline struct inode *d_inode(const struct dentry *dentry)
{
	struct inode *ino = dentry->d_inode;

	return ino;
}
d_inode(dentry) 将 inode 信息从 dentry 结构中取出来,dentry 结构定义如下:
struct dentry {
/* RCU lookup touched fields */
...
struct qstr d_name;
struct inode *d_inode; /* Where the name belongs to - NULL is negative */
...
union {
struct hlist_node d_alias; /* inode alias list */
struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */
struct rcu_head d_rcu;
} d_u;
};
dentry 这一层,不是简单的从硬盘中移除.为了高性能,当前 ext4 对目录做了一些缓存处理.应该是先设置标志位,然后根据 sync 机制回写存储.
vfs 之下的机制就先不详述了,因为我也不太清楚,蛤蛤.
来源: http://www.jianshu.com/p/d8a0b753f949