linux 文件I/O
Table of Contents
1 文件表
内核进程struct task_struct中有一个保存这个进程文件表的字段task_struct->files。
struct files_struct { /* * read mostly part */ atomic_t count; bool resize_in_progress; wait_queue_head_t resize_wait; struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP */ spinlock_t file_lock ____cacheline_aligned_in_smp; int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; };
fdtable是文件描述符表,其中一个fdtable指针是用于fdtab不够用时动态分配的。next_fd用于查找下一个文件描述符,下面的unsigned long用于保存文件描述符位图。
2 open
open在手册中有两个原型。
int open(const char *pathname, int flags); int open(const char *pathname, int flags, mode_t mode);
内核绝对不可能为一个功能创建两个系统调用。实际上,我们调用open时,调用的是glibc封装的函数,然后由glibc进行真正的系统调用。glibc提供了一个变参函数open来满足两个函数原型。
extern int open (const char *__file, int __oflag, ...) __nonnull ((1));
- pathname 表示要打开的文件路径。
- flags 打开文件选项。常用的右O_RDONLY,O_WRONLY和O_RDWR。这三个选项必须有且只能右一个被指定。O_RDONLY=0,O_WRONLY=1,O_RDWR=2。除了这三个,还有更多选项。
- mode 只在创建文件时需要,用于指定所创建文件的权限位,受umask环境变量的影响。
更多选项包括。
- O_APPEND 每次进行写操作时,先定位到文件末尾。
- O_ASYNC 使用异步I/O模式。
- O_CLOEXEC 打开时就设置FD_CLOEXEC标志。
- O_CREAT 当文件不存在就创建。
- O_DIRECT 对该文件直接I/O,不使用VFS Cache。
- O_EXCL 确保是此次调用创建的文件,与O_CREAT同时使用。文件已经存在则失败。
- O_LARGEFILE 表明文件是大文件。
- O_NOATIME 读取文件时不更新文件最后的访问时间。
- O_NONBLOCK、O_NDELAY 设置为非阻塞。
- O_SYNC 设置为I/O同步模式。每次写都将数据同步到磁盘write才返回。
- O_TRUNC 打开文件时,将文件长度截断为0,与O_RDWR或O_WRONLY同时使用。写文件时,如果是作为新文件重新写入,要使用O_TRUNC标志,否则会造成旧内容依然存在的错误,如生成配置文件、pid文件等。
下面我们跟踪打开文件的过程。
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_flags op; /*flag是用户传递的参数,检查合法性并根据mode生成新的flags*/ int fd = build_open_flags(flags, mode, &op); struct filename *tmp; if (fd) return fd; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); /*申请新的文件描述符*/ fd = get_unused_fd_flags(flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { /*产生打开文件的通知事件*/ fsnotify_open(f); /*将文件描述副和文件管理结构对应起来*/ fd_install(fd, f); } } putname(tmp); return fd; } /*do_sys_open->get_unused_fd_flags->alloc_fd(0, (flags))*/ int __alloc_fd(struct files_struct *files, unsigned start, unsigned end, unsigned flags) { unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (fd < fdt->max_fds) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (fd >= end) goto out; error = expand_files(files, fd); if (error < 0) goto out; /* * If we needed to expand the fs array we * might have blocked - try again. */ if (error) goto repeat; if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); error = fd; #if 1 /* Sanity check */ if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } #endif out: spin_unlock(&files->file_lock); return error; } void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; might_sleep(); rcu_read_lock_sched(); while (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); wait_event(files->resize_wait, !files->resize_in_progress); rcu_read_lock_sched(); } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); }
当用户使用fd和内核交互时,内核可以用fdt->fd[fd]得到内部管理文件的结构struct file。
3 creat
由于历史原因,早期的open第二个参数只能时0,1,2。这样就没有办法打开一个不存在的文件。因此一个独立的系统调用creat被引入。现在的creat只是open的一个封装。
4 close
4.1 跟踪close
SYSCALL_DEFINE1(close, unsigned int, fd) { int retval = __close_fd(current->files, fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK)) retval = -EINTR; return retval; } EXPORT_SYMBOL(sys_close); int __close_fd(struct files_struct *files, unsigned fd) { struct file *file; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (fd >= fdt->max_fds) goto out_unlock; file = fdt->fd[fd]; if (!file) goto out_unlock; rcu_assign_pointer(fdt->fd[fd], NULL); __clear_close_on_exec(fd, fdt); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); return filp_close(file, files); out_unlock: spin_unlock(&files->file_lock); return -EBADF; } static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } int filp_close(struct file *filp, fl_owner_t id) { int retval = 0; if (!file_count(filp)) { printk(KERN_ERR "VFS: Close: file count is 0\n"); return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } fput(filp); return retval; } EXPORT_SYMBOL(filp_close);
- Linux从小到大分配文件描述符号。
- 关闭文件描述符号时如果比next_fd小,则更新next_fd。
这个策略容易引发难以定位的bug。比如一个线程关闭了文件描述符号,然后又创建一个,这时文件描述符被复用了。如果又另一个线程保存了之前的文件描述符,那么它就可以再次访问。
4.2 再使用files_operations对特定类型的文件操作
以socket为例,说明Linux如何挂载文件系统指定的文件操作函数集合。
/* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. */ static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, };
利用lsof可以查看进程打开的文件,方便调试。
5 lseek
5.1 lseek原型
lseek的原型如下:
off_t lseek(int fd, off_t offset, int whence);
该函数用于将fd的文件偏移量设置为以whence为起点,偏移为offset的位置。其中whence可以为SEEK_SET,SEEK_CUR和SEEK_END,分别表示文件的起点、当前和末尾位置,而offset的值正负均可。linux3.1以后,SEEK_DATA和SEEK_HOLE被加入,用于寻找文件中的数据和空洞。
需要小心lseek的返回值,当lseek执行成功时,它返回最终以文件起始位置为起点的偏移位置。如果出错,则返回-1,同时errno被设置为对应的错误值。对于一些设备文件,允许返回负的偏移量。因此想要判断lseek是否真正出错,必须在调用lseek前将errno重置为0,然后再调用lseek,同时检查返回值及errno的值。两个同时成立才能表明lseek真正出错了。
5.2 lseek源码分析
lseek的源码位于read_write.c中。以使用default_llseek为例。
loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { loff_t (*fn)(struct file *, loff_t, int); fn = no_llseek; if (file->f_mode & FMODE_LSEEK) { if (file->f_op->llseek) fn = file->f_op->llseek; } return fn(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } fdput_pos(f); return retval; }
6 read
linux中读文件最常用的就是read函数。
ssize_t read(int fd, void *buf, size_t count);
read尝试从fd中读取count个字节到buf中,并返回成功读取的字节数,同时将文件偏移向前移动相同的字节数。返回0的时候表示已经到达文件结尾。read还有可能读取比count小的字节数。
使用read要注意正确处理错误,也就是说read返回-1时,如果errno为EAGAIN、EWOULDBLOCK或EINTR,一般情况下都不能将其视为错误。因为前两者是由于当前fd为非阻塞且没有可读数据时返回的,后者是由于read被信号终端造成的。这两种情况都可以视为正常情况。
read正常读取的返回值可能会比count少,具体读取多少字节依照各个部分的实现而定,不可主观臆断。
下面是read相关源码。
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; iov_iter_init(&iter, READ, &iov, 1, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { if (file->f_op->read) return file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) return new_sync_read(file, buf, count, pos); else return -EINVAL; } ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (!ret) { if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; ret = __vfs_read(file, buf, count, pos); if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_read(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); } return ret; }
7 write
linux写文件操作常用write函数,原型如下。
ssize_t write(int fd, const void *buf, size_t count);
write尝试从buf写入count个字节到文件描述符fd,并返回成功写入的字节数,同时将文件偏移向前移动相同字节数。write有可能写入比指定count少的字节数。
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; iov_iter_init(&iter, WRITE, &iov, 1, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; return ret; } ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, loff_t *pos) { if (file->f_op->write) return file->f_op->write(file, p, count, pos); else if (file->f_op->write_iter) return new_sync_write(file, p, count, pos); else return -EINVAL; } ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(VERIFY_READ, buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (!ret) { if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); ret = __vfs_write(file, buf, count, pos); if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); } return ret; } static inline loff_t file_pos_read(struct file *file) { return file->f_pos; } static inline void file_pos_write(struct file *file, loff_t pos) { file->f_pos = pos; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_write(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); } return ret; }
8 dup
Linux提供了三个复制文件描述符的系统调用。
int dup(int oldfd); int dup2(int oldfd, int newfd); int dup3(int oldfd, int newfd, int flags);
- dup 使用一个最小的尚未使用的文件描述符作为复制后的文件描述符。
- dup2 使用用户定义的newfd来复制oldfd,如果newfd已经是打开的,那么Linux先关闭newfd,然后再复制oldfd。这在daemon中重定向stdout或stderr非常有用。
- dup3 只有定义了feature宏"_GNU_SOURCE"才可以使用,它比dup2多了一个参数,用于指明标志位,目前仅仅支持O_CLOEXEC。可以避免将文件内容暴露给子进程。
newld和oldfd都指向同一个文件描述结构。下面是dup的源码。
void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; might_sleep(); rcu_read_lock_sched(); while (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); wait_event(files->resize_wait, !files->resize_in_progress); rcu_read_lock_sched(); } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) { __fd_install(current->files, fd, file); } int __alloc_fd(struct files_struct *files, unsigned start, unsigned end, unsigned flags) { unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (fd < fdt->max_fds) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (fd >= end) goto out; error = expand_files(files, fd); if (error < 0) goto out; /* * If we needed to expand the fs array we * might have blocked - try again. */ if (error) goto repeat; if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); error = fd; #if 1 /* Sanity check */ if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } #endif out: spin_unlock(&files->file_lock); return error; } int get_unused_fd_flags(unsigned flags) { return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; }
9 文件数据的同步
为了提高性能,操作系统会对文件的I/O操作进行缓存处理。对于读操作,如果要读取的内容已经存在文件缓存中,就直接读取文件缓存。对于写操作,会先将修改提交到文件缓存中,在合适的时机后者过一段时间后,操作系统才将改动提交到磁盘上。Linux提供了三个接口。
void sync(void); int fsync(int fd); int fdatasync(int fd);
sync的实现如下。
/* * Sync everything. We start by waking flusher threads so that most of * writeback runs on all devices in parallel. Then we sync all inodes reliably * which effectively also waits for all flusher threads to finish doing * writeback. At this point all data is on disk so metadata should be stable * and we tell filesystems to sync their metadata via ->sync_fs() calls. * Finally, we writeout all block devices because some filesystems (e.g. ext2) * just write metadata (such as inodes or bitmaps) to block device page cache * and do not sync it on their own in ->sync_fs(). */ SYSCALL_DEFINE0(sync) { int nowait = 0, wait = 1; wakeup_flusher_threads(0, WB_REASON_SYNC); iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); iterate_bdevs(fdatawrite_one_bdev, NULL); iterate_bdevs(fdatawait_one_bdev, NULL); if (unlikely(laptop_mode)) laptop_sync_completion(); return 0; } /** * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync * @start: offset in bytes of the beginning of data range to sync * @end: offset in bytes of the end of data range (inclusive) * @datasync: perform only datasync * * Write back data in range @start..@end and metadata for @file to disk. If * @datasync is set only metadata needed to access modified file data is * written. */ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; if (!file->f_op->fsync) return -EINVAL; if (!datasync && (inode->i_state & I_DIRTY_TIME)) { spin_lock(&inode->i_lock); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); mark_inode_dirty_sync(inode); } return file->f_op->fsync(file, start, end, datasync); }
与APUE描述不同,sync是阻塞调用。fsync只同步fd指定的文件,fdatasync只同步文件的实际内容,和会影响后面数据操作的元数据。而fsync不仅同步数据,还同步所有被修改过的文件元数据。因为磁盘有自己的缓存,所以sync不能保证数据被真正写到磁盘上。
10 文件元数据
文件元数据包括文件的访问权限、上次访问的时间戳、所有者、所有组、文件大小等信息。
linux提供三个接口获得文件信息。
#include <sys/types.h> #include <sys/stat.h> #include <unistd.h> int stat(const char *path, struct stat *buf); int fstat(int fd, struct stat *buf); int lstat(const char *path, struct stat *buf);
stat得到路径path所指定的文件信息,fstat得到文件描述符fd指定的文件信息,而当path是链接文件时,lstat得到的是链接文件自己本身的基本信息而不是其指向文件的信息。
struct kstat { u64 ino; dev_t dev; umode_t mode; unsigned int nlink; kuid_t uid; kgid_t gid; dev_t rdev; loff_t size; struct timespec atime; struct timespec mtime; struct timespec ctime; unsigned long blksize; unsigned long long blocks; };
11 truncate
linux提供两个截断文件的函数。
#include <unistd.h> #include <sys/types.h> int truncate(const char *path, off_t length); int ftruncate(int fd, off_t length);
length可以大于文件本身的大小,这时文件长度将变为length,扩充的内容被填上0。
如果需要在文件中写入新的内容,则应该使用truncate。可以在open中使用O_TRUNC,或者在打开前使用truncate,或者在打开后使用ftruncate将文件截断为0。
truncate的实现如下。
int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { struct inode *inode; long error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; error = mnt_want_write(path->mnt); if (error) goto out; error = inode_permission(inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = locks_verify_truncate(inode, NULL, length); if (!error) error = security_path_truncate(path); if (!error) error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; } EXPORT_SYMBOL_GPL(vfs_truncate); static long do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... */ return -EINVAL; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return do_sys_truncate(path, length); } #endif static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; struct dentry *dentry; struct fd f; int error; error = -EINVAL; if (length < 0) goto out; error = -EBADF; f = fdget(fd); if (!f.file) goto out; /* explicitly opened as large or we are on 64-bit box */ if (f.file->f_flags & O_LARGEFILE) small = 0; dentry = f.file->f_path.dentry; inode = dentry->d_inode; error = -EINVAL; if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) goto out_putf; error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) goto out_putf; error = -EPERM; if (IS_APPEND(inode)) goto out_putf; sb_start_write(inode->i_sb); error = locks_verify_truncate(inode, f.file, length); if (!error) error = security_path_truncate(&f.file->f_path); if (!error) error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: fdput(f); out: return error; } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { return do_sys_ftruncate(fd, length, 1); }