linux 文件I/O

1. 文件表
2. open
3. creat
4. close
- 4.1. 跟踪close
- 4.2. 再使用file_operations对特定类型的文件操作
5. lseek
- 5.1. lseek原型
- 5.2. lseek源码分析
6. read
7. write
8. dup
9. 文件数据的同步
10. 文件元数据
11. truncate

1 文件表

内核进程struct task_struct中有一个保存这个进程文件表的字段task_struct->files。

/*
 * Per-process open file table; the surrounding notes state it is reached
 * through task_struct->files.
 */
struct files_struct {
  /*
   * read mostly part
   */
        atomic_t count;                 /* NOTE(review): presumably a reference count - confirm */
        bool resize_in_progress;        /* __fd_install() waits on resize_wait while this is set */
        wait_queue_head_t resize_wait;  /* wait queue paired with resize_in_progress */

        struct fdtable __rcu *fdt;      /* RCU-protected pointer to the fdtable in use */
        struct fdtable fdtab;           /* fdtable embedded in this structure */
  /*
   * written part on a separate cache line in SMP
   */
        spinlock_t file_lock ____cacheline_aligned_in_smp; /* taken by __alloc_fd() and __close_fd() */
        int next_fd;                    /* where __alloc_fd() starts looking for a free descriptor */
        /* One-word bitmaps; presumably the initial storage for the fdtable bitmaps - TODO confirm */
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
        struct file __rcu * fd_array[NR_OPEN_DEFAULT]; /* inline array of NR_OPEN_DEFAULT file pointers */
};

fdtable是文件描述符表，其中一个fdtable指针是用于fdtab不够用时动态分配的。next_fd用于查找下一个文件描述符，下面的unsigned long用于保存文件描述符位图。

2 open

open在手册中有两个原型。

int open(const char *pathname, int flags);
int open(const char *pathname, int flags, mode_t mode);

内核绝对不可能为一个功能创建两个系统调用。实际上，我们调用open时，调用的是glibc封装的函数，然后由glibc进行真正的系统调用。glibc提供了一个变参函数open来满足两个函数原型。

extern int open (const char *__file, int __oflag, ...) __nonnull ((1));

pathname 表示要打开的文件路径。
flags 打开文件选项。常用的有O_RDONLY、O_WRONLY和O_RDWR。这三个选项必须有且只能有一个被指定。O_RDONLY=0,O_WRONLY=1,O_RDWR=2。除了这三个，还有更多选项。
mode 只在创建文件时需要，用于指定所创建文件的权限位，受进程umask（文件模式创建屏蔽字，是进程属性而不是环境变量）的影响，实际权限为 mode & ~umask。

更多选项包括。

O_APPEND 每次进行写操作时，先定位到文件末尾。
O_ASYNC 使用异步I/O模式。
O_CLOEXEC 打开时就设置FD_CLOEXEC标志。
O_CREAT 当文件不存在就创建。
O_DIRECT 对该文件直接I/O，不使用VFS Cache。
O_EXCL 确保是此次调用创建的文件，与O_CREAT同时使用。文件已经存在则失败。
O_LARGEFILE 表明文件是大文件。
O_NOATIME 读取文件时不更新文件最后的访问时间。
O_NONBLOCK、O_NDELAY 设置为非阻塞。
O_SYNC 设置为I/O同步模式。每次写都将数据同步到磁盘write才返回。
O_TRUNC 打开文件时，将文件长度截断为0，与O_RDWR或O_WRONLY同时使用。写文件时，如果是作为新文件重新写入，要使用O_TRUNC标志，否则会造成旧内容依然存在的错误，如生成配置文件、pid文件等。

下面我们跟踪打开文件的过程。

/*
 * do_sys_open - open a file and bind it to a new file descriptor.
 * @dfd:      passed through unchanged to do_filp_open()
 * @filename: user-space pointer to the path
 * @flags:    open flags supplied by the caller
 * @mode:     mode bits, consumed by build_open_flags()
 *
 * Returns the new descriptor on success, or a negative error code taken
 * from build_open_flags(), getname(), get_unused_fd_flags() or
 * do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
        struct open_flags op;
        /* flags come from user space: validate them and fold flags/mode into op */
        int fd = build_open_flags(flags, mode, &op);
        struct filename *tmp;

        /* a non-zero result from build_open_flags() is returned as-is */
        if (fd)
                return fd;

        tmp = getname(filename);
        if (IS_ERR(tmp))
                return PTR_ERR(tmp);

        /* reserve a new file descriptor */
        fd = get_unused_fd_flags(flags);
        if (fd >= 0) {
                struct file *f = do_filp_open(dfd, tmp, &op);
                if (IS_ERR(f)) {
                        /* open failed: give the reserved descriptor back */
                        put_unused_fd(fd);
                        fd = PTR_ERR(f);
                } else {
                        /* emit the "file opened" notification event */
                        fsnotify_open(f);
                        /* associate the descriptor with the struct file */
                        fd_install(fd, f);
                }
        }
        /* tmp is released on both the success and the failure path */
        putname(tmp);
        return fd;
}

/*
 * Call chain in these notes: do_sys_open -> get_unused_fd_flags -> __alloc_fd.
 *
 * __alloc_fd - reserve a free descriptor number in @files.
 * @files: file table to allocate from
 * @start: lowest descriptor number that may be returned
 * @end:   exclusive upper bound; a candidate >= @end fails with -EMFILE
 * @flags: only O_CLOEXEC is examined here
 *
 * Runs with files->file_lock held from entry to exit. Returns the reserved
 * descriptor, or a negative error code (-EMFILE, or whatever negative value
 * expand_files() reports).
 */
int __alloc_fd(struct files_struct *files,
               unsigned start, unsigned end, unsigned flags)
{
        unsigned int fd;
        int error;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
repeat:
        /* re-read the table on every pass: it is fetched again after a retry */
        fdt = files_fdtable(files);
        /* begin at the larger of start and the next_fd hint */
        fd = start;
        if (fd < files->next_fd)
                fd = files->next_fd;

        if (fd < fdt->max_fds)
                fd = find_next_fd(fdt, fd);

        /*
         * N.B. For clone tasks sharing a files structure, this test
         * will limit the total number of files that can be opened.
         */
        error = -EMFILE;
        if (fd >= end)
                goto out;

        error = expand_files(files, fd);
        if (error < 0)
                goto out;

        /*
         * If we needed to expand the fs array we
         * might have blocked - try again.
         */
        if (error)
                goto repeat;

        /* advance the hint only when the search began at or below it */
        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        error = fd;
#if 1
        /* Sanity check */
        if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
                printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
                rcu_assign_pointer(fdt->fd[fd], NULL);
        }
#endif

out:
        spin_unlock(&files->file_lock);
        return error;
}

/*
 * __fd_install - publish @file in slot @fd of the descriptor table of @files.
 *
 * Does not take files->file_lock; the store is made inside an RCU-sched
 * read-side section. While files->resize_in_progress is set, the read-side
 * section is dropped and the caller sleeps on files->resize_wait, which is
 * why might_sleep() is asserted up front. The target slot must be empty,
 * otherwise BUG_ON() fires.
 */
void __fd_install(struct files_struct *files, unsigned int fd,
                struct file *file)
{
        struct fdtable *fdt;

        might_sleep();
        rcu_read_lock_sched();

        while (unlikely(files->resize_in_progress)) {
                /* leave the read-side section before sleeping, re-enter afterwards */
                rcu_read_unlock_sched();
                wait_event(files->resize_wait, !files->resize_in_progress);
                rcu_read_lock_sched();
        }
        /* coupled with smp_wmb() in expand_fdtable() */
        smp_rmb();
        fdt = rcu_dereference_sched(files->fdt);
        BUG_ON(fdt->fd[fd] != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        rcu_read_unlock_sched();
}

当用户使用fd和内核交互时，内核可以用fdt->fd[fd]得到内部管理文件的结构struct file。

3 creat

由于历史原因，早期的open第二个参数只能是0、1、2。这样就没有办法创建并打开一个不存在的文件。因此一个独立的系统调用creat被引入。现在的creat只是open的一个封装，相当于open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode)。

4 close

4.1 跟踪close

/*
 * close(2): release descriptor @fd of the current process.
 *
 * The table entry has already been cleared by the time __close_fd()
 * returns, so the call cannot be restarted: every restart code is
 * reported to the caller as -EINTR instead.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
        int rc = __close_fd(current->files, fd);

        switch (rc) {
        case -ERESTARTSYS:
        case -ERESTARTNOINTR:
        case -ERESTARTNOHAND:
        case -ERESTART_RESTARTBLOCK:
                return -EINTR;
        default:
                return rc;
        }
}
EXPORT_SYMBOL(sys_close);

/*
 * __close_fd - detach descriptor @fd from @files and close the file behind it.
 *
 * Returns -EBADF when @fd is outside the table or the slot is empty;
 * otherwise returns whatever filp_close() returns.
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
        struct file *file;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (fd >= fdt->max_fds)
                goto out_unlock;
        file = fdt->fd[fd];
        if (!file)
                goto out_unlock;
        /* empty the slot and drop the bookkeeping bits while still locked */
        rcu_assign_pointer(fdt->fd[fd], NULL);
        __clear_close_on_exec(fd, fdt);
        __put_unused_fd(files, fd);
        spin_unlock(&files->file_lock);
        /* filp_close() is called only after the spinlock has been released */
        return filp_close(file, files);

out_unlock:
        spin_unlock(&files->file_lock);
        return -EBADF;
}

/*
 * Clear the close-on-exec bit of @fd in @fdt. The bitmap is written only
 * when the bit is currently set.
 */
static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
{
        if (!test_bit(fd, fdt->close_on_exec))
                return;

        __clear_bit(fd, fdt->close_on_exec);
}

/*
 * Mark @fd as free in the descriptor table of @files and pull the next_fd
 * search hint down to @fd when @fd lies below it.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
        __clear_open_fd(fd, files_fdtable(files));

        if (files->next_fd > fd)
                files->next_fd = fd;
}

/*
 * Clear the "open" bit of @fd in @fdt, and clear the full_fds_bits bit
 * indexed by fd / BITS_PER_LONG (the open_fds word that contains @fd).
 */
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
        __clear_bit(fd, fdt->open_fds);
        __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}

/*
 * filp_close - close one reference to @filp on behalf of owner @id.
 *
 * Invokes the file's ->flush() hook when one is provided and returns its
 * result (0 when there is no hook). For files not opened with FMODE_PATH,
 * dnotify state and POSIX locks belonging to @id are dropped as well.
 * Finishes with fput(). A file whose count is already zero is reported
 * via printk and left untouched; 0 is returned in that case.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
        int err = 0;

        if (file_count(filp) == 0) {
                printk(KERN_ERR "VFS: Close: file count is 0\n");
                return 0;
        }

        if (filp->f_op->flush != NULL)
                err = filp->f_op->flush(filp, id);

        if (likely((filp->f_mode & FMODE_PATH) == 0)) {
                dnotify_flush(filp, id);
                locks_remove_posix(filp, id);
        }

        fput(filp);
        return err;
}

EXPORT_SYMBOL(filp_close);

Linux从小到大分配文件描述符号。
关闭文件描述符号时如果比next_fd小，则更新next_fd。

这个策略容易引发难以定位的bug。比如一个线程关闭了文件描述符，然后又打开一个新文件，这时同一个文件描述符号被复用了。如果有另一个线程保存了之前的文件描述符，那么它再次使用该描述符时，访问到的就是新打开的文件，而不是原来的文件。

4.2 再使用file_operations对特定类型的文件操作

以socket为例，说明Linux如何挂载文件系统指定的文件操作函数集合。

/*
 *      Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *      in the operation structures but are done directly via the socketcall() multiplexor.
 *
 *      This table is the file_operations set for socket files: each member
 *      points the generic file entry point at its socket-specific handler.
 */
static const struct file_operations socket_file_ops = {
        .owner =        THIS_MODULE,
        .llseek =       no_llseek,      /* llseek is routed to no_llseek for sockets */
        .read_iter =    sock_read_iter,
        .write_iter =   sock_write_iter,
        .poll =         sock_poll,
        .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = compat_sock_ioctl,      /* only built when CONFIG_COMPAT is defined */
#endif
        .mmap =         sock_mmap,
        .release =      sock_close,
        .fasync =       sock_fasync,
        .sendpage =     sock_sendpage,
        .splice_write = generic_splice_sendpage,
        .splice_read =  sock_splice_read,
};

利用lsof可以查看进程打开的文件，方便调试。

5 lseek

5.1 lseek原型

lseek的原型如下：

off_t lseek(int fd, off_t offset, int whence);

该函数用于将fd的文件偏移量设置为以whence为起点，偏移为offset的位置。其中whence可以为SEEK_SET，SEEK_CUR和SEEK_END，分别表示文件的起点、当前和末尾位置，而offset的值正负均可。linux3.1以后，SEEK_DATA和SEEK_HOLE被加入，用于寻找文件中的数据和空洞。

需要小心lseek的返回值，当lseek执行成功时，它返回最终以文件起始位置为起点的偏移位置。如果出错，则返回-1，同时errno被设置为对应的错误值。对于一些设备文件，允许返回负的偏移量。因此想要判断lseek是否真正出错，必须在调用lseek前将errno重置为0，然后再调用lseek，同时检查返回值及errno的值。两个同时成立才能表明lseek真正出错了。

5.2 lseek源码分析

lseek的源码位于read_write.c中。以使用default_llseek为例。

/*
 * default_llseek - generic llseek implementation.
 * @file:   file whose f_pos is repositioned
 * @offset: requested offset, interpreted according to @whence
 * @whence: SEEK_END, SEEK_CUR, SEEK_DATA or SEEK_HOLE get special handling;
 *          any other value uses @offset unchanged as the new position
 *
 * The whole operation runs under the inode lock. Returns the resulting
 * position, -ENXIO for SEEK_DATA/SEEK_HOLE at or beyond i_size, or -EINVAL
 * when the computed offset is negative and unsigned_offsets(file) is false.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file_inode(file);
        loff_t retval;

        inode_lock(inode);
        switch (whence) {
                case SEEK_END:
                        /* relative to the current file size */
                        offset += i_size_read(inode);
                        break;
                case SEEK_CUR:
                        /* a zero-length relative seek only reports the position */
                        if (offset == 0) {
                                retval = file->f_pos;
                                goto out;
                        }
                        offset += file->f_pos;
                        break;
                case SEEK_DATA:
                        /*
                         * In the generic case the entire file is data, so as
                         * long as offset isn't at the end of the file then the
                         * offset is data.
                         */
                        if (offset >= inode->i_size) {
                                retval = -ENXIO;
                                goto out;
                        }
                        break;
                case SEEK_HOLE:
                        /*
                         * There is a virtual hole at the end of the file, so
                         * as long as offset isn't i_size or larger, return
                         * i_size.
                         */
                        if (offset >= inode->i_size) {
                                retval = -ENXIO;
                                goto out;
                        }
                        offset = inode->i_size;
                        break;
        }
        retval = -EINVAL;
        /* negative results are accepted only when unsigned_offsets(file) says so */
        if (offset >= 0 || unsigned_offsets(file)) {
                if (offset != file->f_pos) {
                        /* f_version is reset to 0 whenever the position actually changes */
                        file->f_pos = offset;
                        file->f_version = 0;
                }
                retval = offset;
        }
out:
        inode_unlock(inode);
        return retval;
}
EXPORT_SYMBOL(default_llseek);

/*
 * vfs_llseek - dispatch a seek request to the file's llseek method.
 *
 * The file's own ->llseek is used only when the file has FMODE_LSEEK set
 * and actually provides the method; in every other case no_llseek is
 * called instead.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
        loff_t (*seek)(struct file *, loff_t, int) = no_llseek;

        if ((file->f_mode & FMODE_LSEEK) && file->f_op->llseek)
                seek = file->f_op->llseek;

        return seek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);

/*
 * lseek(2) entry point.
 *
 * Returns -EBADF when @fd does not resolve to a file, -EINVAL when @whence
 * is greater than SEEK_MAX, -EOVERFLOW when the resulting position does not
 * fit in off_t, and otherwise the value returned by vfs_llseek().
 */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
        off_t retval;
        struct fd f = fdget_pos(fd);
        if (!f.file)
                return -EBADF;

        retval = -EINVAL;
        if (whence <= SEEK_MAX) {
                loff_t res = vfs_llseek(f.file, offset, whence);
                /* narrow to off_t, then compare back to detect a lost high part */
                retval = res;
                if (res != (loff_t)retval)
                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
        }
        fdput_pos(f);
        return retval;
}

6 read

linux中读文件最常用的就是read函数。

ssize_t read(int fd, void *buf, size_t count);

read尝试从fd中读取count个字节到buf中，并返回成功读取的字节数，同时将文件偏移向前移动相同的字节数。返回0的时候表示已经到达文件结尾。read还有可能读取比count小的字节数。

使用read要注意正确处理错误，也就是说read返回-1时，如果errno为EAGAIN、EWOULDBLOCK或EINTR，一般情况下都不能将其视为错误。因为前两者是由于当前fd为非阻塞且没有可读数据时返回的，后者是由于read被信号中断造成的。这几种情况都可以视为正常情况。

read正常读取的返回值可能会比count少，具体读取多少字节依照各个部分的实现而定，不可主观臆断。

下面是read相关源码。

/*
 * new_sync_read - perform a read through the file's ->read_iter method.
 *
 * Wraps the user buffer in a single-segment iov_iter, seeds a kiocb with
 * the position in *@ppos, calls ->read_iter and writes the kiocb's position
 * back to *@ppos unconditionally. Returns what ->read_iter returned.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, READ, &iov, 1, len);

	ret = filp->f_op->read_iter(&kiocb, &iter);
	/* -EIOCBQUEUED is treated as a bug on this path */
	BUG_ON(ret == -EIOCBQUEUED);
	*ppos = kiocb.ki_pos;
	return ret;
}

/*
 * __vfs_read - pick the read method offered by the file.
 *
 * ->read takes precedence; otherwise ->read_iter is driven through
 * new_sync_read(). A file providing neither yields -EINVAL.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		   loff_t *pos)
{
	if (file->f_op->read)
		return file->f_op->read(file, buf, count, pos);

	if (file->f_op->read_iter)
		return new_sync_read(file, buf, count, pos);

	return -EINVAL;
}

/*
 * vfs_read - validate a read request and hand it to __vfs_read().
 *
 * Returns -EBADF without FMODE_READ, -EINVAL without FMODE_CAN_READ,
 * -EFAULT when access_ok() rejects the buffer, a non-zero result of
 * rw_verify_area(), or the result of __vfs_read().
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	/* VERIFY_WRITE: a read stores data into the user buffer */
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (!ret) {
		/* a single call transfers at most MAX_RW_COUNT bytes */
		if (count > MAX_RW_COUNT)
			count =  MAX_RW_COUNT;
		ret = __vfs_read(file, buf, count, pos);
		if (ret > 0) {
			/* notification and rchar accounting only when bytes were read */
			fsnotify_access(file);
			add_rchar(current, ret);
		}
		/* inc_syscr() runs for every attempt that passed the checks above */
		inc_syscr(current);
	}

	return ret;
}

/*
 * read(2) entry point.
 *
 * Resolves @fd, reads at the file's current position through vfs_read()
 * and stores the updated position back only when vfs_read() did not fail.
 * Returns -EBADF when @fd does not resolve to a file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t nread;
	loff_t pos;

	if (!f.file)
		return -EBADF;

	pos = file_pos_read(f.file);
	nread = vfs_read(f.file, buf, count, &pos);
	if (nread >= 0)
		file_pos_write(f.file, pos);
	fdput_pos(f);

	return nread;
}

7 write

linux写文件操作常用write函数，原型如下。

ssize_t write(int fd, const void *buf, size_t count);

write尝试从buf写入count个字节到文件描述符fd，并返回成功写入的字节数，同时将文件偏移向前移动相同字节数。write有可能写入比指定count少的字节数。

/*
 * new_sync_write - perform a write through the file's ->write_iter method.
 *
 * Wraps the user buffer in a single-segment iov_iter, seeds a kiocb with
 * the position in *@ppos and calls ->write_iter. *@ppos is updated only
 * when the call reports a positive byte count. Returns what ->write_iter
 * returned.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
        /* the cast drops const because iov_base is declared without it */
        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
        struct kiocb kiocb;
        struct iov_iter iter;
        ssize_t ret;

        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
        iov_iter_init(&iter, WRITE, &iov, 1, len);

        ret = filp->f_op->write_iter(&kiocb, &iter);
        /* -EIOCBQUEUED is treated as a bug on this path */
        BUG_ON(ret == -EIOCBQUEUED);
        if (ret > 0)
                *ppos = kiocb.ki_pos;
        return ret;
}

/*
 * __vfs_write - pick the write method offered by the file.
 *
 * ->write takes precedence; otherwise ->write_iter is driven through
 * new_sync_write(). A file providing neither yields -EINVAL.
 */
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
                    loff_t *pos)
{
        if (file->f_op->write)
                return file->f_op->write(file, p, count, pos);

        if (file->f_op->write_iter)
                return new_sync_write(file, p, count, pos);

        return -EINVAL;
}

/*
 * vfs_write - validate a write request and hand it to __vfs_write().
 *
 * Returns -EBADF without FMODE_WRITE, -EINVAL without FMODE_CAN_WRITE,
 * -EFAULT when access_ok() rejects the buffer, a non-zero result of
 * rw_verify_area(), or the result of __vfs_write().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
        ssize_t ret;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        /* VERIFY_READ: a write only loads data from the user buffer */
        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
                return -EFAULT;

        ret = rw_verify_area(WRITE, file, pos, count);
        if (!ret) {
                /* a single call transfers at most MAX_RW_COUNT bytes */
                if (count > MAX_RW_COUNT)
                        count =  MAX_RW_COUNT;
                /* the write itself is bracketed by file_start_write()/file_end_write() */
                file_start_write(file);
                ret = __vfs_write(file, buf, count, pos);
                if (ret > 0) {
                        /* notification and wchar accounting only when bytes were written */
                        fsnotify_modify(file);
                        add_wchar(current, ret);
                }
                inc_syscw(current);
                file_end_write(file);
        }

        return ret;
}

/* Return the current position (f_pos) stored in @file. */
static inline loff_t file_pos_read(struct file *file)
{
        return file->f_pos;
}

/* Store @pos as the current position (f_pos) of @file. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
        file->f_pos = pos;
}

/*
 * write(2) entry point.
 *
 * Resolves @fd, writes at the file's current position through vfs_write()
 * and stores the updated position back only when vfs_write() did not fail.
 * Returns -EBADF when @fd does not resolve to a file.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
                size_t, count)
{
        struct fd f = fdget_pos(fd);
        ssize_t nwritten;
        loff_t pos;

        if (!f.file)
                return -EBADF;

        pos = file_pos_read(f.file);
        nwritten = vfs_write(f.file, buf, count, &pos);
        if (nwritten >= 0)
                file_pos_write(f.file, pos);
        fdput_pos(f);

        return nwritten;
}

8 dup

Linux提供了三个复制文件描述符的系统调用。

int dup(int oldfd);
int dup2(int oldfd, int newfd);
int dup3(int oldfd, int newfd, int flags);

dup 使用一个最小的尚未使用的文件描述符作为复制后的文件描述符。
dup2 使用用户定义的newfd来复制oldfd，如果newfd已经是打开的，那么Linux先关闭newfd，然后再复制oldfd。这在daemon中重定向stdout或stderr非常有用。
dup3 只有定义了feature宏"_GNU_SOURCE"才可以使用，它比dup2多了一个参数，用于指明标志位，目前仅仅支持O_CLOEXEC。可以避免将文件内容暴露给子进程。

newfd和oldfd都指向同一个文件描述结构（struct file）。下面是dup的源码。

/*
 * __fd_install - publish @file in slot @fd of the descriptor table of @files.
 *
 * Does not take files->file_lock; the store is made inside an RCU-sched
 * read-side section. While files->resize_in_progress is set, the read-side
 * section is dropped and the caller sleeps on files->resize_wait, which is
 * why might_sleep() is asserted up front. The target slot must be empty,
 * otherwise BUG_ON() fires.
 */
void __fd_install(struct files_struct *files, unsigned int fd,
                struct file *file)
{
        struct fdtable *fdt;

        might_sleep();
        rcu_read_lock_sched();

        while (unlikely(files->resize_in_progress)) {
                /* leave the read-side section before sleeping, re-enter afterwards */
                rcu_read_unlock_sched();
                wait_event(files->resize_wait, !files->resize_in_progress);
                rcu_read_lock_sched();
        }
        /* coupled with smp_wmb() in expand_fdtable() */
        smp_rmb();
        fdt = rcu_dereference_sched(files->fdt);
        BUG_ON(fdt->fd[fd] != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        rcu_read_unlock_sched();
}

/* Install @file at descriptor @fd in the current task's file table. */
void fd_install(unsigned int fd, struct file *file)
{
        __fd_install(current->files, fd, file);
}

/*
 * __alloc_fd - reserve a free descriptor number in @files.
 * @files: file table to allocate from
 * @start: lowest descriptor number that may be returned
 * @end:   exclusive upper bound; a candidate >= @end fails with -EMFILE
 * @flags: only O_CLOEXEC is examined here
 *
 * Runs with files->file_lock held from entry to exit. Returns the reserved
 * descriptor, or a negative error code (-EMFILE, or whatever negative value
 * expand_files() reports).
 */
int __alloc_fd(struct files_struct *files,
               unsigned start, unsigned end, unsigned flags)
{
        unsigned int fd;
        int error;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
repeat:
        /* re-read the table on every pass: it is fetched again after a retry */
        fdt = files_fdtable(files);
        /* begin at the larger of start and the next_fd hint */
        fd = start;
        if (fd < files->next_fd)
                fd = files->next_fd;

        if (fd < fdt->max_fds)
                fd = find_next_fd(fdt, fd);

        /*
         * N.B. For clone tasks sharing a files structure, this test
         * will limit the total number of files that can be opened.
         */
        error = -EMFILE;
        if (fd >= end)
                goto out;

        error = expand_files(files, fd);
        if (error < 0)
                goto out;

        /*
         * If we needed to expand the fs array we
         * might have blocked - try again.
         */
        if (error)
                goto repeat;

        /* advance the hint only when the search began at or below it */
        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        error = fd;
#if 1
        /* Sanity check */
        if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
                printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
                rcu_assign_pointer(fdt->fd[fd], NULL);
        }
#endif

out:
        spin_unlock(&files->file_lock);
        return error;
}

/*
 * Reserve a descriptor for the current task: searches from 0 up to the
 * task's RLIMIT_NOFILE limit and forwards @flags to __alloc_fd().
 */
int get_unused_fd_flags(unsigned flags)
{
        return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}

/*
 * dup(2) entry point.
 *
 * Looks up the file behind @fildes and installs it under a newly reserved
 * descriptor (flags 0). When no descriptor can be reserved, the file
 * obtained from fget_raw() is released with fput() and the error is
 * returned. Returns -EBADF when @fildes does not resolve to a file.
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
        struct file *file = fget_raw(fildes);
        int newfd;

        if (!file)
                return -EBADF;

        newfd = get_unused_fd_flags(0);
        if (newfd < 0) {
                fput(file);
                return newfd;
        }

        fd_install(newfd, file);
        return newfd;
}

9 文件数据的同步

为了提高性能，操作系统会对文件的I/O操作进行缓存处理。对于读操作，如果要读取的内容已经存在文件缓存中，就直接读取文件缓存。对于写操作，会先将修改提交到文件缓存中，在合适的时机或者过一段时间后，操作系统才将改动提交到磁盘上。Linux提供了三个接口。

void sync(void);
int fsync(int fd);
int fdatasync(int fd);

sync的实现如下。

/*
 * Sync everything. We start by waking flusher threads so that most of
 * writeback runs on all devices in parallel. Then we sync all inodes reliably
 * which effectively also waits for all flusher threads to finish doing
 * writeback. At this point all data is on disk so metadata should be stable
 * and we tell filesystems to sync their metadata via ->sync_fs() calls.
 * Finally, we writeout all block devices because some filesystems (e.g. ext2)
 * just write metadata (such as inodes or bitmaps) to block device page cache
 * and do not sync it on their own in ->sync_fs().
 *
 * Always returns 0.
 */
SYSCALL_DEFINE0(sync)
{
        int nowait = 0, wait = 1;

        wakeup_flusher_threads(0, WB_REASON_SYNC);
        iterate_supers(sync_inodes_one_sb, NULL);
        /* two sync_fs passes over all superblocks: argument 0 first, then 1 */
        iterate_supers(sync_fs_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &wait);
        /* block devices: one pass with fdatawrite_one_bdev, one with fdatawait_one_bdev */
        iterate_bdevs(fdatawrite_one_bdev, NULL);
        iterate_bdevs(fdatawait_one_bdev, NULL);
        if (unlikely(laptop_mode))
                laptop_sync_completion();
        return 0;
}

/**
 * vfs_fsync_range - helper to sync a range of data & metadata to disk
 * @file:               file to sync
 * @start:              offset in bytes of the beginning of data range to sync
 * @end:                offset in bytes of the end of data range (inclusive)
 * @datasync:           perform only datasync
 *
 * Write back data in range @start..@end and metadata for @file to disk.  If
 * @datasync is set only metadata needed to access modified file data is
 * written.
 *
 * Returns -EINVAL when the file has no ->fsync method, otherwise the
 * result of that method.
 */
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
        struct inode *inode = file->f_mapping->host;

        if (!file->f_op->fsync)
                return -EINVAL;
        /*
         * Full sync only: clear I_DIRTY_TIME under i_lock, then mark the
         * inode dirty via mark_inode_dirty_sync() before calling ->fsync.
         */
        if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
                spin_lock(&inode->i_lock);
                inode->i_state &= ~I_DIRTY_TIME;
                spin_unlock(&inode->i_lock);
                mark_inode_dirty_sync(inode);
        }
        return file->f_op->fsync(file, start, end, datasync);
}

与APUE描述不同，sync是阻塞调用。fsync只同步fd指定的文件，fdatasync只同步文件的实际内容，和会影响后面数据操作的元数据。而fsync不仅同步数据，还同步所有被修改过的文件元数据。因为磁盘有自己的缓存，所以sync不能保证数据被真正写到磁盘上。

10 文件元数据

文件元数据包括文件的访问权限、上次访问的时间戳、所有者、所有组、文件大小等信息。

linux提供三个接口获得文件信息。

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

int stat(const char *path, struct stat *buf);
int fstat(int fd, struct stat *buf);
int lstat(const char *path, struct stat *buf);

stat得到路径path所指定的文件信息，fstat得到文件描述符fd指定的文件信息，而当path是链接文件时，lstat得到的是链接文件自己本身的基本信息而不是其指向文件的信息。

/*
 * File metadata record. NOTE(review): presumably the in-kernel counterpart
 * of the user-space struct stat filled in for stat/fstat/lstat - confirm.
 * Field meanings below are inferred from the names.
 */
struct kstat {
        u64             ino;            /* inode number */
        dev_t           dev;            /* device holding the file */
        umode_t         mode;           /* file type and permission bits */
        unsigned int    nlink;          /* link count */
        kuid_t          uid;            /* owner */
        kgid_t          gid;            /* owning group */
        dev_t           rdev;           /* device number for device special files - TODO confirm */
        loff_t          size;           /* file size */
        struct timespec  atime;         /* access time */
        struct timespec mtime;          /* modification time */
        struct timespec ctime;          /* change time */
        unsigned long   blksize;        /* block size */
        unsigned long long      blocks; /* block count */
};

11 truncate

linux提供两个截断文件的函数。

#include <unistd.h>
#include <sys/types.h>

int truncate(const char *path, off_t length);
int ftruncate(int fd, off_t length);

length可以大于文件本身的大小，这时文件长度将变为length，扩充的内容被填上0。

如果需要在文件中写入新的内容，则应该使用truncate。可以在open中使用O_TRUNC，或者在打开前使用truncate，或者在打开后使用ftruncate将文件截断为0。

truncate的实现如下。

/*
 * do_truncate - set the size of the inode behind @dentry to @length.
 * @dentry:     dentry of the file to resize
 * @length:     new size; negative values are rejected with -EINVAL
 * @time_attrs: extra ATTR_* bits OR-ed into ia_valid
 * @filp:       optional open file; when non-NULL it is recorded in the
 *              iattr and ATTR_FILE is set
 *
 * Builds a struct iattr and applies it with notify_change() under the
 * inode lock. Returns the result of notify_change(), or a negative value
 * from dentry_needs_remove_privs().
 */
int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
        struct file *filp)
{
        int ret;
        struct iattr newattrs;

        /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
        if (length < 0)
                return -EINVAL;

        newattrs.ia_size = length;
        newattrs.ia_valid = ATTR_SIZE | time_attrs;
        if (filp) {
                newattrs.ia_file = filp;
                newattrs.ia_valid |= ATTR_FILE;
        }

        /* Remove suid, sgid, and file capabilities on truncate too */
        ret = dentry_needs_remove_privs(dentry);
        if (ret < 0)
                return ret;
        /* a positive result is a set of attribute bits to add, forced via ATTR_FORCE */
        if (ret)
                newattrs.ia_valid |= ret | ATTR_FORCE;

        inode_lock(dentry->d_inode);
        /* Note any delegations or leases have already been broken: */
        ret = notify_change(dentry, &newattrs, NULL);
        inode_unlock(dentry->d_inode);
        return ret;
}

/*
 * vfs_truncate - truncate the file at @path to @length bytes.
 *
 * Performs, in order: file-type check, mnt_want_write(), write-permission
 * check, append-only check, get_write_access(), break_lease(),
 * locks_verify_truncate(), security_path_truncate(), and finally
 * do_truncate(). The labels at the end undo the acquisitions in reverse
 * order. Returns 0 on success or a negative error code.
 */
long vfs_truncate(const struct path *path, loff_t length)
{
        struct inode *inode;
        long error;

        inode = path->dentry->d_inode;

        /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
        if (S_ISDIR(inode->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;

        error = mnt_want_write(path->mnt);
        if (error)
                goto out;

        error = inode_permission(inode, MAY_WRITE);
        if (error)
                goto mnt_drop_write_and_out;

        /* inodes for which IS_APPEND() is true are refused with -EPERM */
        error = -EPERM;
        if (IS_APPEND(inode))
                goto mnt_drop_write_and_out;

        error = get_write_access(inode);
        if (error)
                goto mnt_drop_write_and_out;

        /*
         * Make sure that there are no leases.  get_write_access() protects
         * against the truncate racing with a lease-granting setlease().
         */
        error = break_lease(inode, O_WRONLY);
        if (error)
                goto put_write_and_out;

        /* each later step runs only if every earlier one returned 0 */
        error = locks_verify_truncate(inode, NULL, length);
        if (!error)
                error = security_path_truncate(path);
        if (!error)
                error = do_truncate(path->dentry, length, 0, NULL);

put_write_and_out:
        put_write_access(inode);
mnt_drop_write_and_out:
        mnt_drop_write(path->mnt);
out:
        return error;
}
EXPORT_SYMBOL_GPL(vfs_truncate);

/*
 * do_sys_truncate - path-based truncate shared by the truncate syscalls.
 *
 * Resolves @pathname relative to AT_FDCWD with LOOKUP_FOLLOW and calls
 * vfs_truncate(). When retry_estale() asks for it, the lookup is repeated
 * with LOOKUP_REVAL added. Returns -EINVAL for a negative @length,
 * otherwise the lookup or vfs_truncate() result.
 */
static long do_sys_truncate(const char __user *pathname, loff_t length)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;

        if (length < 0) /* sorry, but loff_t says... */
                return -EINVAL;

retry:
        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
        if (!error) {
                error = vfs_truncate(&path, length);
                /* drop the path reference obtained by user_path_at() */
                path_put(&path);
        }
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

/* truncate(2) entry point: forwards @path and @length to do_sys_truncate(). */
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
        return do_sys_truncate(path, length);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of truncate(2): takes @length as compat_off_t and
 * forwards to do_sys_truncate().
 */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
        return do_sys_truncate(path, length);
}
#endif

/*
 * do_sys_ftruncate - descriptor-based truncate.
 * @fd:     descriptor of the file to resize
 * @length: new size; negative values yield -EINVAL
 * @small:  non-zero enforces the MAX_NON_LFS limit unless the file was
 *          opened with O_LARGEFILE
 *
 * Returns 0 on success, or -EINVAL, -EBADF, -EPERM, or an error from
 * locks_verify_truncate(), security_path_truncate() or do_truncate().
 */
static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
        struct inode *inode;
        struct dentry *dentry;
        struct fd f;
        int error;

        error = -EINVAL;
        if (length < 0)
                goto out;
        error = -EBADF;
        f = fdget(fd);
        if (!f.file)
                goto out;

        /* explicitly opened as large or we are on 64-bit box */
        if (f.file->f_flags & O_LARGEFILE)
                small = 0;

        dentry = f.file->f_path.dentry;
        inode = dentry->d_inode;
        /* only regular files opened with FMODE_WRITE may be truncated here */
        error = -EINVAL;
        if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
                goto out_putf;

        /* error is already -EINVAL at this point; it is assigned again below */
        error = -EINVAL;
        /* Cannot ftruncate over 2^31 bytes without large file support */
        if (small && length > MAX_NON_LFS)
                goto out_putf;

        error = -EPERM;
        if (IS_APPEND(inode))
                goto out_putf;

        /* the truncate itself is bracketed by sb_start_write()/sb_end_write() */
        sb_start_write(inode->i_sb);
        error = locks_verify_truncate(inode, f.file, length);
        if (!error)
                error = security_path_truncate(&f.file->f_path);
        if (!error)
                error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
        sb_end_write(inode->i_sb);
out_putf:
        fdput(f);
out:
        return error;
}

/*
 * ftruncate(2) entry point: forwards to do_sys_ftruncate() with small = 1,
 * so the MAX_NON_LFS limit applies unless the file has O_LARGEFILE set.
 */
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
{
        return do_sys_ftruncate(fd, length, 1);
}