Linux kernel: VFS

Preface

The Virtual File System (VFS), also known as the Virtual Filesystem Switch, is the kernel software layer that handles all system calls related to the standard Unix filesystem model.

The role of the VFS

Take cp as an example: the code below (coreutils' sparse_copy) never talks to a specific filesystem. It simply read()s from the source descriptor into the buf buffer and writes the data out with full_write.

static bool
sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size,
             size_t hole_size, bool punch_holes,
             char const *src_name, char const *dst_name,
             uintmax_t max_n_read, off_t *total_n_read,
             bool *last_write_made_hole)
{
  *last_write_made_hole = false;
  *total_n_read = 0;
  bool make_hole = false;
  off_t psize = 0;

  while (max_n_read)
    {
      ssize_t n_read = read (src_fd, buf, MIN (max_n_read, buf_size));
      if (n_read < 0)
        {
          if (errno == EINTR)
            continue;
          error (0, errno, _("error reading %s"), quoteaf (src_name));
          return false;
        }
      if (n_read == 0)
        break;
      max_n_read -= n_read;
      *total_n_read += n_read;

      /* Loop over the input buffer in chunks of hole_size.  */
      size_t csize = hole_size ? hole_size : buf_size;
      char *cbuf = buf;
      char *pbuf = buf;

      while (n_read)
        {
          bool prev_hole = make_hole;
          csize = MIN (csize, n_read);

          if (hole_size && csize)
            make_hole = is_nul (cbuf, csize);

          bool transition = (make_hole != prev_hole) && psize;
          bool last_chunk = (n_read == csize && ! make_hole) || ! csize;

          if (transition || last_chunk)
            {
              if (! transition)
                psize += csize;

              if (! prev_hole)
                {
                  if (full_write (dest_fd, pbuf, psize) != psize)
                    {
                      error (0, errno, _("error writing %s"),
                             quoteaf (dst_name));
                      return false;
                    }
                }
              else
                {
                  if (! create_hole (dest_fd, dst_name, punch_holes, psize))
                    return false;
                }

              pbuf = cbuf;
              psize = csize;

              if (last_chunk)
                {
                  if (! csize)
                    n_read = 0; /* Finished processing buffer.  */

                  if (transition)
                    csize = 0;  /* Loop again to deal with last chunk.  */
                  else
                    psize = 0;  /* Reset for next read loop.  */
                }
            }
          else  /* Coalesce writes/seeks.  */
            {
              if (INT_ADD_WRAPV (psize, csize, &psize))
                {
                  error (0, 0, _("overflow reading %s"), quoteaf (src_name));
                  return false;
                }
            }

          n_read -= csize;
          cbuf += csize;
        }

      *last_write_made_hole = make_hole;

      /* It's tempting to break early here upon a short read from
         a regular file.  That would save the final read syscall
         for each file.  Unfortunately that doesn't work for
         certain files in /proc or /sys with linux kernels.  */
    }

  /* Ensure a trailing hole is created, so that subsequent
     calls of sparse_copy() start at the correct offset.  */
  if (make_hole && ! create_hole (dest_fd, dst_name, punch_holes, psize))
    return false;
  else
    return true;
}

For cp /mnt/ext4_disk/filea /tmp/fileb, the cp command interacts only with the VFS: filea lives on an ext4 filesystem, while fileb lives on a tmpfs mounted at /tmp, as shown below:

flowchart TB
    cp(cp) --> vfs[vfs] --> tmpfs --> fileb
    filea --> ext4 --> vfs
    vfs[vfs] --> cp(cp)

The common file model

The common file model can represent every supported filesystem. It consists of the following object types:

  • Superblock object: stores information about a mounted filesystem

  • Inode object: stores general information about a specific file; each inode object carries an inode number, which uniquely identifies the file within its filesystem

  • File object: stores information about the interaction between an open file and a process

  • Dentry object (directory entry object): stores the information that links a particular name for a file to the corresponding file

The diagram below shows a simple example: three processes have the same file open. Two of them use the same pathname while the third uses a different pathname (e.g. a hard link), so only two dentry objects are needed:

flowchart TB
    process_1(Process 1) ---> fsobj_1(file object) ---> |dentry cache|dentry_obj_1
    process_2(Process 2) ---> fsobj_2(file object) ---> dentry_obj_1
    process_3(Process 3) ---> fsobj_3(file object) ---> dentry_obj_2
    subgraph dentries
        dentry_obj_1(dentry object)
        dentry_obj_2(dentry object)
    end
        dentry_obj_1(dentry object) ---> inode_obj(inode object) ---> disk_file(disk file)
        dentry_obj_2(dentry object) ---> inode_obj(inode object) ---> |superblock object|disk_file(disk file)
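The split between file objects and inodes can be observed from userspace. The sketch below (my own illustration, not kernel code; the pathname is arbitrary) opens the same file twice: fstat() reports the same inode for both descriptors, yet reading through one descriptor does not move the other's offset, because each open() created a separate file object.

/* Illustration: two opens of one pathname -> two file objects, one inode. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
    int fd1 = open("/etc/hostname", O_RDONLY);
    int fd2 = open("/etc/hostname", O_RDONLY);
    struct stat st1, st2;
    char c;

    if (fd1 < 0 || fd2 < 0)
        return 1;

    fstat(fd1, &st1);
    fstat(fd2, &st2);
    read(fd1, &c, 1);   /* advances only fd1's f_pos */

    printf("same inode: %d\n", st1.st_ino == st2.st_ino);
    printf("offsets: fd1=%lld fd2=%lld\n",
           (long long)lseek(fd1, 0, SEEK_CUR),
           (long long)lseek(fd2, 0, SEEK_CUR));
    close(fd1);
    close(fd2);
    return 0;
}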

Filesystem types handled by the VFS

  • Disk-based filesystems
    • Linux
      • Ext2
      • Ext4 (which also handles Ext3)
      • Reiserfs
    • Oracle
      • Btrfs
    • Samsung
      • F2FS
    • SGI
      • XFS
    • Microsoft
      • MS-DOS
      • VFAT
      • NTFS (the NTFS3 driver, requires kernel 5.15 or later)
    • ...
  • Network filesystems
    • NFS
    • SMB/CIFS (Microsoft's network filesystem; the in-kernel ksmbd server has been merged)
    • Ceph
    • Lustre (removed from the kernel's staging tree in 4.18)
    • ...
  • Special filesystems
    • autofs: automatic mounting of filesystems
    • bdev: block devices
    • binfmt-misc: support for arbitrary executable formats; can be mounted anywhere
    • configfs: kernel configuration interface, mounted at /sys/kernel/config
    • debugfs: debugging
    • devtmpfs: provides an early /dev during kernel boot so that normal boot programs do not have to wait for udev, shortening GNU/Linux boot time
    • devpts: pseudo terminals, mounted at /dev/pts
    • efivarfs: UEFI variables, stored in the firmware's NVRAM
    • eventpollfs: used by the event-polling (epoll) machinery
    • futexfs: used by the fast userspace locking mechanism (futex)
    • hugetlbfs: huge page support
    • pipefs: pipes
    • proc: general access point to kernel data structures, mounted at /proc
    • rootfs: provides an empty root directory during the boot phase
    • shm: IPC shared memory regions
    • mqueue: POSIX message queues; can be mounted anywhere
    • sockfs/socketfs: sockets
    • sysfs: general access point to system data, mounted at /sys
    • tmpfs: temporary files; can be mounted anywhere
    • usbfs: USB devices, mounted at /proc/bus/usb
    • ...

System calls handled by the VFS

The system calls handled by the VFS involve filesystems, regular files, directories and symbolic links; there are also system calls that operate on device files and pipes, and finally a number of system calls used to implement networking functionality.

VFS data structures and operations

The basic data structures and their corresponding operations.

Because the VFS is a very high-level abstraction, it declares the complete set of data structures and operations; a particular filesystem implements only the methods it needs and leaves the rest NULL, and the concrete implementations are linked in when the module is loaded.
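As a hypothetical illustration (the filesystem name is invented, while the generic_* helpers are real kernel library functions), an operations table is usually populated with designated initializers, so every method that is not listed stays NULL and the VFS either falls back to generic behaviour or reports the operation as unsupported:

/* Hypothetical example: a filesystem fills in only the methods it needs;
 * all other members of the operations table remain NULL. */
static const struct file_operations examplefs_file_ops = {
    .owner      = THIS_MODULE,
    .llseek     = generic_file_llseek,
    .read_iter  = generic_file_read_iter,
    .write_iter = generic_file_write_iter,
    .mmap       = generic_file_mmap,
    .open       = generic_file_open,
    .fsync      = generic_file_fsync,
    /* .lock, .fallocate, ... left NULL: not supported */
};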

The superblock object

struct super_block {
    //指向超级块链表的指针
    struct list_head    s_list;     /* Keep this first */
    // 设备标识符
    dev_t           s_dev;      /* search index; _not_ kdev_t */
    // block size expressed as a number of bits (log2 of s_blocksize)
    unsigned char       s_blocksize_bits;
    // 块大小,单位是字节
    unsigned long       s_blocksize;
    // 文件的最长长度
    loff_t          s_maxbytes; /* Max file size */
    // 文件系统类型
    struct file_system_type *s_type;
    // 超级块方法
    const struct super_operations   *s_op;
    // 磁盘限额处理方法
    const struct dquot_operations   *dq_op;
    // 磁盘限额管理方法
    const struct quotactl_ops   *s_qcop;
    // 网络文件系统的输出操作
    const struct export_operations *s_export_op;
    // 安装标志
    unsigned long       s_flags;
    // internal SB_I_* flags (e.g. SB_I_CGROUPWB for cgroup writeback bio throttling)
    unsigned long       s_iflags;   /* internal SB_I_* flags */
    // 文件系统的魔法数字
    unsigned long       s_magic;
    // 文件系统根目录的目录项对象
    struct dentry       *s_root;
    // 卸载所用的信号量
    struct rw_semaphore s_umount;
    // 引用计数器
    int         s_count;
    // 次级引用计数器
    atomic_t        s_active;
    // 指向超级块安全数据结构
#ifdef CONFIG_SECURITY
    void                    *s_security;
#endif
    // 指向超级块扩展属性
    const struct xattr_handler **s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
    // 超级块文件系统的加密操作
    const struct fscrypt_operations *s_cop;
    // 主加密密钥
    struct key      *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
    // 文件系统验证操作
    const struct fsverity_operations *s_vop;
#endif
#ifdef CONFIG_UNICODE
    // 字符集编码
    struct unicode_map *s_encoding;
    // 字符集编码标志
    __u16 s_encoding_flags;
#endif
    // 网络文件系统的可选根目录的目录项对象
    struct hlist_bl_head    s_roots;    /* alternate root dentries for NFS */
    // 挂载列表
    struct list_head    s_mounts;   /* list of mounts; _not_ for fs use */
    // 指向块设备驱动程序
    struct block_device *s_bdev;
    // 指向块设备驱动程序信息
    struct backing_dev_info *s_bdi;
    // MTD(Memory Technology Device),NAND或者NOR Flash设备信息
    struct mtd_info     *s_mtd;
    // 用于给定文件系统类型的超级块对象链表的指针
    struct hlist_node   s_instances;
    // 磁盘配额的类型
    unsigned int        s_quota_types;  /* Bitmask of supported quota types */
    // 磁盘配额的操作
    struct quota_info   s_dquot;    /* Diskquota specific options */
    // 超级块的写入锁,包括每个CPU的信号量
    struct sb_writers   s_writers;

    /*
     * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
     * s_fsnotify_marks together for cache efficiency. They are frequently
     * accessed and rarely modified.
     */
    // 指向特定文件系统的超级块信息的指针
    void            *s_fs_info; /* Filesystem private info */

    /* Granularity of c/m/atime in ns (cannot be worse than a second) */
    // 时间戳粒度,纳秒级
    u32         s_time_gran;
    /* Time limits for c/m/atime in seconds */
    // 时间限制
    time64_t           s_time_min;
    time64_t           s_time_max;
#ifdef CONFIG_FSNOTIFY
    __u32           s_fsnotify_mask;
    struct fsnotify_mark_connector __rcu    *s_fsnotify_marks;
#endif
    // 包含超级块的块设备名称
    char            s_id[32];   /* Informational name */
    // 文件系统上卷的UUID
    uuid_t          s_uuid;     /* UUID */
    // 每个文件的最大链接数量
    unsigned int        s_max_links;
    // 文件的模式,bit掩码
    fmode_t         s_mode;

    /*
     * The next field is for VFS *only*. No filesystems have any business
     * even looking at it. You had been warned.
     */
    // 虚拟文件系统重命名互斥锁
    struct mutex s_vfs_rename_mutex;    /* Kludge */

    /*
     * Filesystem subtype.  If non-empty the filesystem type field
     * in /proc/mounts will be "type.subtype"
     */
    // 文件系统亚型
    const char *s_subtype;
    // 目录项默认操作
    const struct dentry_operations *s_d_op; /* default d_op for dentries */

    /*
     * Saved pool identifier for cleancache (-1 means none)
     */
    int cleancache_poolid;
    // 回收器
    struct shrinker s_shrink;   /* per-sb shrinker handle */

    /* Number of inodes with nlink == 0 but still referenced */
    atomic_long_t s_remove_count;

    /*
     * Number of inode/mount/sb objects that are being watched, note that
     * inodes objects are currently double-accounted.
     */
    atomic_long_t s_fsnotify_connectors;

    /* Being remounted read-only */
    int s_readonly_remount;

    /* per-sb errseq_t for reporting writeback errors via syncfs */
    errseq_t s_wb_err;

    /* AIO completions deferred from interrupt context */
    struct workqueue_struct *s_dio_done_wq;
    struct hlist_head s_pins;

    /*
     * Owning user namespace and default context in which to
     * interpret filesystem uids, gids, quotas, device nodes,
     * xattrs and security labels.
     */
    struct user_namespace *s_user_ns;

    /*
     * The list_lru structure is essentially just a pointer to a table
     * of per-node lru lists, each of which has its own spinlock.
     * There is no need to put them into separate cachelines.
     */
    struct list_lru     s_dentry_lru;
    struct list_lru     s_inode_lru;
    struct rcu_head     rcu;
    struct work_struct  destroy_work;

    struct mutex        s_sync_lock;    /* sync serialisation lock */

    /*
     * Indicates how deep in a filesystem stack this SB is
     */
    int s_stack_depth;

    /* s_inode_list_lock protects s_inodes */
    spinlock_t      s_inode_list_lock ____cacheline_aligned_in_smp;
    struct list_head    s_inodes;   /* all inodes */

    spinlock_t      s_inode_wblist_lock;
    struct list_head    s_inodes_wb;    /* writeback inodes */
} __randomize_layout;

Superblock operations

struct super_operations {
    // allocate space for an inode object
    struct inode *(*alloc_inode)(struct super_block *sb);
    // destroy an inode object
    void (*destroy_inode)(struct inode *);
    // free the in-memory inode object back to the slab cache
    void (*free_inode)(struct inode *);
    // dirty_inode: called when the inode is marked dirty; write_inode: write it back to disk
    void (*dirty_inode) (struct inode *, int flags);
    int (*write_inode) (struct inode *, struct writeback_control *wbc);
    // called when the last reference is dropped; decides whether the inode should be evicted
    int (*drop_inode) (struct inode *);
    // called at the end of iput() to evict the inode; if the link count is zero the on-disk file is removed as well
    void (*evict_inode) (struct inode *);
    // release the superblock object (at unmount)
    void (*put_super) (struct super_block *);
    // called to flush the filesystem and update the on-disk filesystem data structures
    int (*sync_fs)(struct super_block *sb, int wait);
    // freeze the filesystem into a consistent state
    int (*freeze_super) (struct super_block *);
    int (*freeze_fs) (struct super_block *);
    // thaw (unfreeze) the filesystem
    int (*thaw_super) (struct super_block *);
    int (*unfreeze_fs) (struct super_block *);
    // return filesystem statistics
    int (*statfs) (struct dentry *, struct kstatfs *);
    // remount the filesystem with new options
    int (*remount_fs) (struct super_block *, int *, char *);
    // called at the start of a forced unmount (MNT_FORCE)
    void (*umount_begin) (struct super_block *);
    // show mount options, device name, path and statistics (for /proc/mounts and friends)
    int (*show_options)(struct seq_file *, struct dentry *);
    int (*show_devname)(struct seq_file *, struct dentry *);
    int (*show_path)(struct seq_file *, struct dentry *);
    int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
    // read/write quota data
    ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
    ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
    // get the disk quota structures for an inode
    struct dquot **(*get_dquots)(struct inode *);
#endif
    // called by the superblock shrinker; returns the number of freeable cached objects
    long (*nr_cached_objects)(struct super_block *,
                  struct shrink_control *);
    // called by the superblock shrinker; scans the cached objects and tries to free them
    long (*free_cached_objects)(struct super_block *,
                    struct shrink_control *);
};

After years of churn, some methods have disappeared. For example, generic_delete_inode no longer takes care of the final teardown of an inode; that work now belongs to iput_final(struct inode *inode), which calls drop_inode and finally evict() to dispose of the inode.

int generic_delete_inode(struct inode *inode)
{
    return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 */
static void iput_final(struct inode *inode)
{
    struct super_block *sb = inode->i_sb;
    const struct super_operations *op = inode->i_sb->s_op;
    unsigned long state;
    int drop;

    WARN_ON(inode->i_state & I_NEW);

    if (op->drop_inode)
        drop = op->drop_inode(inode);
    else
        drop = generic_drop_inode(inode);

    if (!drop &&
        !(inode->i_state & I_DONTCACHE) &&
        (sb->s_flags & SB_ACTIVE)) {
        inode_add_lru(inode);
        spin_unlock(&inode->i_lock);
        return;
    }

    state = inode->i_state;
    if (!drop) {
        WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
        spin_unlock(&inode->i_lock);

        write_inode_now(inode, 1);

        spin_lock(&inode->i_lock);
        state = inode->i_state;
        WARN_ON(state & I_NEW);
        state &= ~I_WILL_FREE;
    }

    WRITE_ONCE(inode->i_state, state | I_FREEING);
    if (!list_empty(&inode->i_lru))
        inode_lru_list_del(inode);
    spin_unlock(&inode->i_lock);

    evict(inode);
}

The inode object


#define IOP_FASTPERM    0x0001
#define IOP_LOOKUP  0x0002
#define IOP_NOFOLLOW    0x0004
#define IOP_XATTR   0x0008
#define IOP_DEFAULT_READLINK    0x0010

struct inode {
    // 文件类型与访问权限
    umode_t         i_mode;
    // 上面定义的操作类型
    unsigned short      i_opflags;
    // user id 与 group id
    kuid_t          i_uid;
    kgid_t          i_gid;
    // 文件系统安装标志
    unsigned int        i_flags;
    // 访问控制列表
#ifdef CONFIG_FS_POSIX_ACL
    struct posix_acl    *i_acl;
    struct posix_acl    *i_default_acl;
#endif
    // 索引节点操作
    const struct inode_operations   *i_op;
    // 指向超级块
    struct super_block  *i_sb;
    // 指向地址空间
    struct address_space    *i_mapping;
    // 指向索引节点安全结构
#ifdef CONFIG_SECURITY
    void            *i_security;
#endif
    // 索引节点号
    /* Stat data, not accessed from path walking */
    unsigned long       i_ino;
    /*
     * Filesystems may only read i_nlink directly.  They shall use the
     * following functions for modification:
     *
     *    (set|clear|inc|drop)_nlink
     *    inode_(inc|dec)_link_count
     */
    // 硬链接数量
    union {
        const unsigned int i_nlink;
        unsigned int __i_nlink;
    };
    // 设备标识符
    dev_t           i_rdev;
    // 文件字节数
    loff_t          i_size;
    // 上次 access modify change 的时间
    struct timespec64   i_atime;
    struct timespec64   i_mtime;
    struct timespec64   i_ctime;
    // 保护索引节点一些字段的自旋锁
    spinlock_t      i_lock; /* i_blocks, i_bytes, maybe i_size */
    // 文件中最后一个块的字节数
    unsigned short          i_bytes;
    // 块的位数
    u8          i_blkbits;
    // write lifetime hint for the block layer; WRITE_LIFE_NOT_SET (0) by default
    u8          i_write_hint;
    // 文件的块数
    blkcnt_t        i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
    // sequence counter used to read i_size consistently on SMP systems
    seqcount_t      i_size_seqcount;
#endif

    /* Misc */
    // 索引节点状态 比特掩码
    unsigned long       i_state;
    // read/write semaphore protecting the inode
    struct rw_semaphore i_rwsem;
    // 第一次弄脏的瞬间
    unsigned long       dirtied_when;   /* jiffies of first dirtying */
    unsigned long       dirtied_time_when;
    // 用于散列表的指针
    struct hlist_node   i_hash;
    struct list_head    i_io_list;  /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
    // 相关联的cgroup 回写
    struct bdi_writeback    *i_wb;      /* the associated cgroup wb */

    /* foreign inode detection, see wbc_detach_inode() */
    int         i_wb_frn_winner;
    u16         i_wb_frn_avg_time;
    u16         i_wb_frn_history;
#endif
    // 索引节点的LRU内存回收链表
    struct list_head    i_lru;      /* inode LRU list */
    // 用于超级块索引节点链表
    struct list_head    i_sb_list;
    struct list_head    i_wb_list;  /* backing dev writeback list */
    // 目录项对象与RCU对象
    union {
        struct hlist_head   i_dentry;
        struct rcu_head     i_rcu;
    };
    atomic64_t      i_version;
    atomic64_t      i_sequence; /* see futex */
    atomic_t        i_count;
    atomic_t        i_dio_count;
    atomic_t        i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
    atomic_t        i_readcount; /* struct files open RO */
#endif
    // 默认文件操作
    union {
        const struct file_operations    *i_fop; /* former ->i_op->default_file_ops */
        void (*free_inode)(struct inode *);
    };
    // 文件锁上下文
    struct file_lock_context    *i_flctx;
    // 地址空间
    struct address_space    i_data;
    // 用于具体的字符或块设备的指针
    struct list_head    i_devices;
    union {
        // 管道设备
        struct pipe_inode_info  *i_pipe;
        // 字符设备
        struct cdev     *i_cdev;
        // 符号链接
        char            *i_link;
        unsigned        i_dir_seq;
    };

    __u32           i_generation;

#ifdef CONFIG_FSNOTIFY
    __u32           i_fsnotify_mask; /* all events this inode cares about */
    struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
    struct fscrypt_info *i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
    struct fsverity_info    *i_verity_info;
#endif

    void            *i_private; /* fs or device private pointer */
} __randomize_layout;

Each inode object always appears on one of the following (doubly linked) lists:

  • the list of valid but unused inodes (the LRU list)

  • the list of in-use inodes

  • the list of dirty inodes

Inode operations

struct inode_operations {
    // look up, in a directory, the inode corresponding to the filename contained in a dentry
    struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
    // return the target that a symbolic link points to
    const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
    // check POSIX-style permissions (NTFS permissions do not map onto this model)
    int (*permission) (struct user_namespace *, struct inode *, int);
    // get the POSIX access control list
    struct posix_acl * (*get_acl)(struct inode *, int, bool);
    // legacy wrapper around get_link; implementing get_link is usually sufficient
    int (*readlink) (struct dentry *, char __user *,int);
    // create a new on-disk inode for the dentry in the given directory
    int (*create) (struct user_namespace *, struct inode *,struct dentry *,
               umode_t, bool);
    // create a new hard link
    int (*link) (struct dentry *,struct inode *,struct dentry *);
    // remove a hard link
    int (*unlink) (struct inode *,struct dentry *);
    // create a symbolic link
    int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,
            const char *);
    // create a new directory inode
    int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,
              umode_t);
    // remove a subdirectory from a directory
    int (*rmdir) (struct inode *,struct dentry *);
    // create a new on-disk inode for a special file (device node, pipe, socket) associated with the dentry
    int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,
              umode_t,dev_t);
    int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
            struct inode *, struct dentry *, unsigned int);
    // set/get inode attributes; list extended attributes
    int (*setattr) (struct user_namespace *, struct dentry *,
            struct iattr *);
    int (*getattr) (struct user_namespace *, const struct path *,
            struct kstat *, u32, unsigned int);
    ssize_t (*listxattr) (struct dentry *, char *, size_t);
    // report the file's extent mapping to userspace (the FIEMAP ioctl)
    int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
              u64 len);
    // update the inode's timestamps or i_version
    int (*update_time)(struct inode *, struct timespec64 *, int);
    // look up, possibly create, and open the last path component in one atomic operation
    int (*atomic_open)(struct inode *, struct dentry *,
               struct file *, unsigned open_flag,
               umode_t create_mode);
    // called at the end of open() with O_TMPFILE
    int (*tmpfile) (struct user_namespace *, struct inode *,
            struct dentry *, umode_t);
    // set the POSIX ACL
    int (*set_acl)(struct user_namespace *, struct inode *,
               struct posix_acl *, int);
    // get/set chattr-style inode flags (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS)
    int (*fileattr_set)(struct user_namespace *mnt_userns,
                struct dentry *dentry, struct fileattr *fa);
    int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
} ____cacheline_aligned;

The file object

A file object describes how a process interacts with a file it has opened.
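One consequence worth keeping in mind: dup() only copies the descriptor, so both descriptors refer to the same file object and share f_pos, whereas a second open() of the same pathname creates a new file object with its own offset. A small userspace sketch (illustrative only, arbitrary pathname):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd    = open("/etc/hostname", O_RDONLY);
    int dupfd = dup(fd);                         /* same struct file */
    int other = open("/etc/hostname", O_RDONLY); /* new struct file */
    char c;

    read(fd, &c, 1);
    printf("fd=%lld dup=%lld other=%lld\n",
           (long long)lseek(fd, 0, SEEK_CUR),
           (long long)lseek(dupfd, 0, SEEK_CUR),  /* follows fd */
           (long long)lseek(other, 0, SEEK_CUR)); /* still 0 */
    return 0;
}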

struct file {
    union {
        //文件对象链表
        struct llist_node   fu_llist;
        //释放之后的RCU链表
        struct rcu_head     fu_rcuhead;
    } f_u;
    struct path     f_path;
    struct inode        *f_inode;   /* cached value */
    const struct file_operations    *f_op;

    /*
     * Protects f_ep, f_flags.
     * Must not be taken from IRQ context.
     */
    // 自旋锁
    spinlock_t      f_lock;
    // write lifetime hint; WRITE_LIFE_NOT_SET (0) by default
    enum rw_hint        f_write_hint;
    //文件对象的使用计数
    atomic_long_t       f_count;
    //当打开文件时所使用的标志
    unsigned int        f_flags;
    //文件的访问模式
    fmode_t         f_mode;
    struct mutex        f_pos_lock;
    //文件当前的位移量
    loff_t          f_pos;
    //拥有者通过信号量进行异步I/O传输
    struct fown_struct  f_owner;
    // 任务的安全上下文
    const struct cred   *f_cred;
    // 文件提前读入page cache的状态
    struct file_ra_state    f_ra;
    // 文件版本号,每次使用后递增
    u64         f_version;
#ifdef CONFIG_SECURITY
    // 指向文件对象的安全结构
    void            *f_security;
#endif
    /* needed for tty driver, and maybe others */
    // 指向特定文件系统或设备驱动程序所需的数据
    void            *private_data;

#ifdef CONFIG_EPOLL
    /* Used by fs/eventpoll.c to link all the hooks to this file */
    // epoll 钩子的链表
    struct hlist_head   *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
    // 页缓存映射
    struct address_space    *f_mapping;
    errseq_t        f_wb_err;
    errseq_t        f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));  /* lest something weird decides that 2 is OK */

File operations

struct file_operations {
    // module that owns this structure
    struct module *owner;
    // change the current read/write position
    loff_t (*llseek) (struct file *, loff_t, int);
    // synchronous read/write, and iterator-based read/write (also used for asynchronous I/O)
    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
    ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
    ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
    // iopoll: poll for completion of polled I/O; iterate/iterate_shared: read directory entries
    int (*iopoll)(struct kiocb *kiocb, bool spin);
    int (*iterate) (struct file *, struct dir_context *);
    int (*iterate_shared) (struct file *, struct dir_context *);
    // support for poll/select/epoll
    __poll_t (*poll) (struct file *, struct poll_table_struct *);
    // ioctl call, issues device-specific operations
    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
    // ioctl entry point for 32-bit processes on a 64-bit kernel
    long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
    // mmap system call
    int (*mmap) (struct file *, struct vm_area_struct *);
    unsigned long mmap_supported_flags;
    // open the file (the first operation performed on it)
    int (*open) (struct inode *, struct file *);
    // flush when a descriptor is closed; mainly used by network filesystems
    int (*flush) (struct file *, fl_owner_t id);
    // release the file structure when the last reference is dropped
    int (*release) (struct inode *, struct file *);
    // backend for fsync, and asynchronous notification (fasync/SIGIO)
    int (*fsync) (struct file *, loff_t, loff_t, int datasync);
    int (*fasync) (int, struct file *, int);
    // implement POSIX file locking
    int (*lock) (struct file *, int, struct file_lock *);
    // send a page over e.g. a socket (used by the splice/sendfile paths)
    ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
    // find the start of an unmapped region in the current process's address space
    unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
    // called by fcntl() to validate flags (F_SETFL)
    int (*check_flags)(int);
    // called by flock()
    int (*flock) (struct file *, int, struct file_lock *);
    // splice data from a pipe into the file
    ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
    // splice data from the file into a pipe
    ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
    // set or release a file lease
    int (*setlease)(struct file *, long, struct file_lock **, void **);
    // preallocate space for the file on the filesystem (used e.g. by Steam, rsync)
    long (*fallocate)(struct file *file, int mode, loff_t offset,
              loff_t len);
    // show extra per-fd information in /proc/<pid>/fdinfo (used e.g. by eventpoll and io_uring)
    void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
    // on no-MMU kernels: report the mmap capabilities of the backing store (e.g. MTD): copy, direct map, read, write, execute
    unsigned (*mmap_capabilities)(struct file *);
#endif
    // copy a range of data between files (copy_file_range)
    ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
            loff_t, size_t, unsigned int);
    // remap (reflink/dedupe) a range of one file into another; implemented e.g. by Btrfs, XFS and network filesystems
    loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                   struct file *file_out, loff_t pos_out,
                   loff_t len, unsigned int remap_flags);
    // called by the fadvise64() system call to declare the expected access pattern for file data
    int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;

API change: get_empty_filp() is gone; when a file is opened, alloc_empty_file() is called to allocate the file object from the slab cache.

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happend e.g. we over file
 * structures limit, run out of memory or operation is not permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance int the mount's writer count
 * and a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
    static long old_max;
    struct file *f;

    /*
     * Privileged users can go above max_files
     */
    if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
        /*
         * percpu_counters are inaccurate.  Do an expensive check before
         * we go and fail.
         */
        if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
            goto over;
    }

    f = __alloc_file(flags, cred);
    if (!IS_ERR(f))
        percpu_counter_inc(&nr_files);

    return f;

over:
    /* Ran out of filps - report that */
    if (get_nr_files() > old_max) {
        pr_info("VFS: file-max limit %lu reached\n", get_max_files());
        old_max = get_nr_files();
    }
    return ERR_PTR(-ENFILE);
}

__alloc_file() calls kmem_cache_zalloc() to allocate from the slab cache; that function is just a wrapper around kmem_cache_alloc() that additionally zeroes the object. It then initializes the reference count, the rwlock, the spinlock and the mutex.

static struct file *__alloc_file(int flags, const struct cred *cred)
{
    struct file *f;
    int error;

    f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
    if (unlikely(!f))
        return ERR_PTR(-ENOMEM);

    f->f_cred = get_cred(cred);
    error = security_file_alloc(f);
    if (unlikely(error)) {
        file_free_rcu(&f->f_u.fu_rcuhead);
        return ERR_PTR(error);
    }

    atomic_long_set(&f->f_count, 1);
    rwlock_init(&f->f_owner.lock);
    spin_lock_init(&f->f_lock);
    mutex_init(&f->f_pos_lock);
    f->f_flags = flags;
    f->f_mode = OPEN_FMODE(flags);
    /* f->f_version: 0 */

    return f;
}

The dentry object

The VFS treats a directory as an ordinary file made up of a number of subdirectories and files.

Dentries have no on-disk image; they exist only in the dentry (slab) cache. (Even the definition of struct dentry lives in include/linux/dcache.h.)
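The size of the dentry cache can be watched from userspace through /proc/sys/fs/dentry-state (the first two fields are nr_dentry and nr_unused). A minimal reader, assuming procfs is mounted at /proc:

/* Sketch: dump the dcache statistics exposed by procfs. */
#include <stdio.h>

int main(void)
{
    char line[256];
    FILE *f = fopen("/proc/sys/fs/dentry-state", "r");

    if (!f)
        return 1;
    if (fgets(line, sizeof(line), f))
        fputs(line, stdout);
    fclose(f);
    return 0;
}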

struct dentry {
    /* RCU lookup touched fields */
    unsigned int d_flags;       /* protected by d_lock */
    // per-dentry sequence counter used for lockless (RCU-walk) lookups
    seqcount_spinlock_t d_seq;  /* per dentry seqlock */
    struct hlist_bl_node d_hash;    /* lookup hash list */
    struct dentry *d_parent;    /* parent directory */
    struct qstr d_name;
    struct inode *d_inode;      /* Where the name belongs to - NULL is
                     * negative */
    unsigned char d_iname[DNAME_INLINE_LEN];    /* small names */

    /* Ref lookup also touches following */
    // spinlock and refcount combined (the d_lock and d_count described in ULK)
    struct lockref d_lockref;   /* per-dentry lock and refcount */
    const struct dentry_operations *d_op;
    struct super_block *d_sb;   /* The root of the dentry tree */
    unsigned long d_time;       /* used by d_revalidate */
    void *d_fsdata;         /* fs-specific data */

    union {
        struct list_head d_lru;     /* LRU list */
        wait_queue_head_t *d_wait;  /* in-lookup ones only */
    };
    struct list_head d_child;   /* child of parent list */
    struct list_head d_subdirs; /* our children */
    /*
     * d_alias and d_rcu can share memory
     */
    union {
        struct hlist_node d_alias;  /* inode alias list */
        struct hlist_bl_node d_in_lookup_hash;  /* only for in-lookup ones */
        struct rcu_head d_rcu;
    } d_u;
} __randomize_layout;

Dentry operations

struct dentry_operations {
    // determine whether a cached dentry is still valid before it is used in a path lookup
    int (*d_revalidate)(struct dentry *, unsigned int);
    // weaker check, mostly concerned with whether the inode is still valid
    int (*d_weak_revalidate)(struct dentry *, unsigned int);
    // compute the hash value for a name
    int (*d_hash)(const struct dentry *, struct qstr *);
    //  compare two filenames
    int (*d_compare)(const struct dentry *,
            unsigned int, const char *, const struct qstr *);
    // called when the last reference to a dentry is dropped, to decide whether it should be discarded
    int (*d_delete)(const struct dentry *);
    // only a few filesystems (e.g. Ceph, FUSE) implement this; typically allocates d_fsdata
    int (*d_init)(struct dentry *);
    // release the dentry's private data before the dentry is freed back to the slab
    void (*d_release)(struct dentry *);
    // called when the dentry is pruned from the dcache, letting the filesystem clean up (e.g. flags kept in the parent)
    void (*d_prune)(struct dentry *);
    // called when a dentry loses its attached inode (defaults to plain iput())
    void (*d_iput)(struct dentry *, struct inode *);
    // generate a pathname (used by pseudo filesystems)
    char *(*d_dname)(struct dentry *, char *, int);
    // called when an automount point is traversed
    struct vfsmount *(*d_automount)(struct path *);
    // let the filesystem manage traversal through this dentry (e.g. block while a mount is pending)
    int (*d_manage)(const struct path *, bool);
    // return the underlying "real" dentry hidden by this one (used by overlayfs)
    struct dentry *(*d_real)(struct dentry *, const struct inode *);
} ____cacheline_aligned;

VFS system calls

These system calls operate on VFS objects; this part focuses on the system calls most commonly used when working with filesystems.

Filesystem registration

Whether a filesystem is built as a module or compiled into the kernel only changes when it registers itself: at first use versus at boot time.

The filesystem type is defined in include/linux/fs.h:

struct file_system_type {
    const char *name;
    // mount flags (the FS_* values below)
    int fs_flags;
#define FS_REQUIRES_DEV     1 
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE      4
#define FS_USERNS_MOUNT     8   /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16  /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
#define FS_THP_SUPPORT      8192    /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE   32768   /* FS will handle d_move() during rename() internally. */
    int (*init_fs_context)(struct fs_context *);
    const struct fs_parameter_spec *parameters;
    struct dentry *(*mount) (struct file_system_type *, int,
               const char *, void *);
    // clean up the superblock when the filesystem is no longer needed
    void (*kill_sb) (struct super_block *);
    struct module *owner;
    // next filesystem type in the singly linked list
    struct file_system_type * next;
    // head of the list of superblocks belonging to this filesystem type
    struct hlist_head fs_supers;

    struct lock_class_key s_lock_key;
    struct lock_class_key s_umount_key;
    struct lock_class_key s_vfs_rename_key;
    struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

    struct lock_class_key i_lock_key;
    struct lock_class_key i_mutex_key;
    struct lock_class_key invalidate_lock_key;
    struct lock_class_key i_mutex_dir_key;
};

Registering a filesystem: all filesystem types live on the singly linked list headed by static struct file_system_type *file_systems;. Registration walks this list to its end and appends the new type there (a minimal module registration sketch follows register_filesystem() below).

static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
    struct file_system_type **p;
    // walk the list and return the slot holding the matching filesystem, or the empty tail slot if none matches
    for (p = &file_systems; *p; p = &(*p)->next)
        if (strncmp((*p)->name, name, len) == 0 &&
            !(*p)->name[len])
            break;
    return p;
}

/**
 *  register_filesystem - register a new filesystem
 *  @fs: the file system structure
 *
 *  Adds the file system passed to the list of file systems the kernel
 *  is aware of for mount and other syscalls. Returns 0 on success,
 *  or a negative errno code on an error.
 *
 *  The &struct file_system_type that is passed is linked into the kernel 
 *  structures and must not be freed until the file system has been
 *  unregistered.
 */

int register_filesystem(struct file_system_type * fs)
{
    int res = 0;
    struct file_system_type ** p;

    if (fs->parameters &&
        !fs_validate_description(fs->name, fs->parameters))
        return -EINVAL;

    BUG_ON(strchr(fs->name, '.'));
    if (fs->next)
        return -EBUSY;
    write_lock(&file_systems_lock);
    p = find_filesystem(fs->name, strlen(fs->name));
    if (*p)
        res = -EBUSY;
    else
    // the slot is empty, so append the new filesystem type at the end of the list
        *p = fs;
    write_unlock(&file_systems_lock);
    return res;
}
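To make the registration path concrete, here is a minimal, illustrative module (the names are invented and it uses the legacy ->mount() interface rather than the newer fs_context API); after insmod the new type appears in /proc/filesystems and can be mounted with mount -t demofs none /mnt:

#include <linux/module.h>
#include <linux/fs.h>

#define DEMOFS_MAGIC 0x64656d6f   /* arbitrary magic number for the sketch */

static int demofs_fill_super(struct super_block *sb, void *data, int silent)
{
    static const struct tree_descr empty[] = { {""} };

    /* simple_fill_super() builds a ramfs-like superblock with an empty root. */
    return simple_fill_super(sb, DEMOFS_MAGIC, empty);
}

static struct dentry *demofs_mount(struct file_system_type *fs_type,
                                   int flags, const char *dev_name, void *data)
{
    /* No backing block device is needed for this pseudo filesystem. */
    return mount_nodev(fs_type, flags, data, demofs_fill_super);
}

static struct file_system_type demofs_type = {
    .owner   = THIS_MODULE,
    .name    = "demofs",
    .mount   = demofs_mount,
    .kill_sb = kill_litter_super,
};

static int __init demofs_init(void)
{
    /* Appends demofs to the file_systems list walked by find_filesystem(). */
    return register_filesystem(&demofs_type);
}

static void __exit demofs_exit(void)
{
    unregister_filesystem(&demofs_type);
}

module_init(demofs_init);
module_exit(demofs_exit);
MODULE_LICENSE("GPL");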

The mount() system call

Each mounted filesystem corresponds to a vfsmount instance.

vfsmount is defined in include/linux/mount.h:

struct vfsmount {
    struct dentry *mnt_root;    /* root of the mounted tree */
    struct super_block *mnt_sb; /* pointer to superblock */
    int mnt_flags;
    struct user_namespace *mnt_userns;
} __randomize_layout;

Inside the kernel, vfsmount is embedded as a member of struct mount, defined in fs/mount.h:

struct mnt_namespace {
    struct ns_common    ns;
    struct mount *  root;
    /*
     * Traversal and modification of .list is protected by either
     * - taking namespace_sem for write, OR
     * - taking namespace_sem for read AND taking .ns_lock.
     */
    struct list_head    list;
    spinlock_t      ns_lock;
    struct user_namespace   *user_ns;
    struct ucounts      *ucounts;
    u64         seq;    /* Sequence number to prevent loops */
    wait_queue_head_t poll;
    u64 event;
    unsigned int        mounts; /* # of mounts in the namespace */
    unsigned int        pending_mounts;
} __randomize_layout;

struct mnt_pcp {
    int mnt_count;
    int mnt_writers;
};

struct mountpoint {
    struct hlist_node m_hash;
    struct dentry *m_dentry;
    struct hlist_head m_list;
    int m_count;
};

struct mount {
    struct hlist_node mnt_hash;
    struct mount *mnt_parent;
    struct dentry *mnt_mountpoint;
    // the embedded vfsmount
    struct vfsmount mnt;
    union {
        struct rcu_head mnt_rcu;
        struct llist_node mnt_llist;
    };
#ifdef CONFIG_SMP
    struct mnt_pcp __percpu *mnt_pcp;
#else
    int mnt_count;
    int mnt_writers;
#endif
    struct list_head mnt_mounts;    /* list of children, anchored here */
    struct list_head mnt_child; /* and going through their mnt_child */
    struct list_head mnt_instance;  /* mount instance on sb->s_mounts */
    const char *mnt_devname;    /* Name of device e.g. /dev/dsk/hda1 */
    struct list_head mnt_list;
    struct list_head mnt_expire;    /* link in fs-specific expiry list */
    struct list_head mnt_share; /* circular list of shared mounts */
    struct list_head mnt_slave_list;/* list of slave mounts */
    struct list_head mnt_slave; /* slave list entry */
    struct mount *mnt_master;   /* slave is on master->mnt_slave_list */
    struct mnt_namespace *mnt_ns;   /* containing namespace */
    struct mountpoint *mnt_mp;  /* where is it mounted */
    union {
        struct hlist_node mnt_mp_list;  /* list mounts with the same mountpoint */
        struct hlist_node mnt_umount;
    };
    struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
    struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
    __u32 mnt_fsnotify_mask;
#endif
    int mnt_id;         /* mount identifier */
    int mnt_group_id;       /* peer group identifier */
    int mnt_expiry_mark;        /* true if marked for expiry */
    struct hlist_head mnt_pins;
    struct hlist_head mnt_stuck_children;
} __randomize_layout;

These are the data structures behind a mount.


So how is a mount actually performed?

Querying which package the mount command belongs to (e.g. with qfile mount) and reading its source shows that it calls glibc's wrapper around the kernel system call; the wrapper is declared in /usr/include/sys/mount.h, and we can use it directly.
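For reference, a call through the glibc wrapper looks like this (device, mount point and options are placeholders, and the call needs CAP_SYS_ADMIN):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    /* mount(source, target, filesystemtype, mountflags, data) */
    if (mount("/dev/sdb1", "/mnt", "ext4", MS_RDONLY, "") == -1) {
        perror("mount");
        return 1;
    }
    return 0;
}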

The kernel defines the mount system call in fs/namespace.c:

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
        char __user *, type, unsigned long, flags, void __user *, data)
{
    int ret;
    char *kernel_type;
    char *kernel_dev;
    void *options;

    kernel_type = copy_mount_string(type);
    ret = PTR_ERR(kernel_type);
    if (IS_ERR(kernel_type))
        goto out_type;

    kernel_dev = copy_mount_string(dev_name);
    ret = PTR_ERR(kernel_dev);
    if (IS_ERR(kernel_dev))
        goto out_dev;

    options = copy_mount_options(data);
    ret = PTR_ERR(options);
    if (IS_ERR(options))
        goto out_data;

    ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);

    kfree(options);
out_data:
    kfree(kernel_dev);
out_dev:
    kfree(kernel_type);
out_type:
    return ret;
}

This first copies the mount() arguments in from userspace, runs do_mount(), and then frees the copies.

long do_mount(const char *dev_name, const char __user *dir_name,
        const char *type_page, unsigned long flags, void *data_page)
{
    struct path path;
    int ret;

    ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
    if (ret)
        return ret;
    ret = path_mount(dev_name, &path, type_page, flags, data_page);
    path_put(&path);
    return ret;
}

do_mount() first resolves the path to obtain the mount point's dentry; path_mount() then processes the various flags and finally calls do_new_mount():

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
            int mnt_flags, const char *name, void *data)
{
    struct file_system_type *type;
    struct fs_context *fc;
    const char *subtype = NULL;
    int err = 0;

    if (!fstype)
        return -EINVAL;

    type = get_fs_type(fstype);
    if (!type)
        return -ENODEV;

    if (type->fs_flags & FS_HAS_SUBTYPE) {
        subtype = strchr(fstype, '.');
        if (subtype) {
            subtype++;
            if (!*subtype) {
                put_filesystem(type);
                return -EINVAL;
            }
        }
    }

    fc = fs_context_for_mount(type, sb_flags);
    put_filesystem(type);
    if (IS_ERR(fc))
        return PTR_ERR(fc);

    if (subtype)
        err = vfs_parse_fs_string(fc, "subtype",
                      subtype, strlen(subtype));
    if (!err && name)
        err = vfs_parse_fs_string(fc, "source", name, strlen(name));
    if (!err)
        err = parse_monolithic_mount_data(fc, data);
    if (!err && !mount_capable(fc))
        err = -EPERM;
    if (!err)
        err = vfs_get_tree(fc);
    if (!err)
        err = do_new_mount_fc(fc, path, mnt_flags);

    put_fs_context(fc);
    return err;
}

This creates a new mount on behalf of userspace and requests that it be added to the namespace's mount tree. do_new_mount_fc() then builds the mount from the filesystem context and makes sure the same filesystem is not mounted twice on the same mount point; the old do_kern_mount() name seems to survive only in security_sb_kern_mount():

/*
 * Create a new mount using a superblock configuration and request it
 * be added to the namespace tree.
 */
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
               unsigned int mnt_flags)
{
    struct vfsmount *mnt;
    struct mountpoint *mp;
    struct super_block *sb = fc->root->d_sb;
    int error;

    error = security_sb_kern_mount(sb);
    if (!error && mount_too_revealing(sb, &mnt_flags))
        error = -EPERM;

    if (unlikely(error)) {
        fc_drop_locked(fc);
        return error;
    }

    up_write(&sb->s_umount);

    mnt = vfs_create_mount(fc);
    if (IS_ERR(mnt))
        return PTR_ERR(mnt);

    mnt_warn_timestamp_expiry(mountpoint, mnt);

    mp = lock_mount(mountpoint);
    if (IS_ERR(mp)) {
        mntput(mnt);
        return PTR_ERR(mp);
    }
    error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
    unlock_mount(mp);
    if (error < 0)
        mntput(mnt);
    return error;
}

Finally do_add_mount() is executed, which grafts the new mount into the mount tree via graft_tree():

/*
 * add a mount into a namespace's mount tree
 */
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
            struct path *path, int mnt_flags)
{
    struct mount *parent = real_mount(path->mnt);

    mnt_flags &= ~MNT_INTERNAL_FLAGS;

    if (unlikely(!check_mnt(parent))) {
        /* that's acceptable only for automounts done in private ns */
        if (!(mnt_flags & MNT_SHRINKABLE))
            return -EINVAL;
        /* ... and for those we'd better have mountpoint still alive */
        if (!parent->mnt_ns)
            return -EINVAL;
    }

    /* Refuse the same filesystem on the same mount point */
    if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
        path->mnt->mnt_root == path->dentry)
        return -EBUSY;

    if (d_is_symlink(newmnt->mnt.mnt_root))
        return -EINVAL;

    newmnt->mnt.mnt_flags = mnt_flags;
    return graft_tree(newmnt, parent, mp);
}

graft_tree() performs the attachment through attach_recursive_mnt(), which takes care of mount propagation, i.e. the sharing and isolation of mounts between namespaces.

static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
    if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
        return -EINVAL;

    if (d_is_dir(mp->m_dentry) !=
          d_is_dir(mnt->mnt.mnt_root))
        return -ENOTDIR;

    return attach_recursive_mnt(mnt, p, mp, false);
}

The umount() system call

Much of this mirrors mount. The umount system call is likewise defined in fs/namespace.c, using the SYSCALL_DEFINE2 macro:

static int ksys_umount(char __user *name, int flags)
{
    int lookup_flags = LOOKUP_MOUNTPOINT;
    struct path path;
    int ret;

    // basic validity checks done first
    if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
        return -EINVAL;

    if (!(flags & UMOUNT_NOFOLLOW))
        lookup_flags |= LOOKUP_FOLLOW;
    ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
    if (ret)
        return ret;
    return path_umount(&path, flags);
}

SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
    return ksys_umount(name, flags);
}

After user_path_at() resolves the path to a dentry, path_umount() is used to call do_umount():

// caller is responsible for flags being sane
int path_umount(struct path *path, int flags)
{
    struct mount *mnt = real_mount(path->mnt);
    int ret;

    ret = can_umount(path, flags);
    if (!ret)
        ret = do_umount(mnt, flags);

    /* we mustn't call path_put() as that would clear mnt_expiry_mark */
    dput(path->dentry);
    mntput_no_expire(mnt);
    return ret;
}

It first checks whether the unmount is allowed, then performs it:

static int do_umount(struct mount *mnt, int flags)
{
    struct super_block *sb = mnt->mnt.mnt_sb;
    int retval;

    retval = security_sb_umount(&mnt->mnt, flags);
    if (retval)
        return retval;

    /*
     * Allow userspace to request a mountpoint be expired rather than
     * unmounting unconditionally. Unmount only happens if:
     *  (1) the mark is already set (the mark is cleared by mntput())
     *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
     */
    if (flags & MNT_EXPIRE) {
        if (&mnt->mnt == current->fs->root.mnt ||
            flags & (MNT_FORCE | MNT_DETACH))
            return -EINVAL;

        /*
         * probably don't strictly need the lock here if we examined
         * all race cases, but it's a slowpath.
         */
        lock_mount_hash();
        if (mnt_get_count(mnt) != 2) {
            unlock_mount_hash();
            return -EBUSY;
        }
        unlock_mount_hash();

        if (!xchg(&mnt->mnt_expiry_mark, 1))
            return -EAGAIN;
    }

    /*
     * If we may have to abort operations to get out of this
     * mount, and they will themselves hold resources we must
     * allow the fs to do things. In the Unix tradition of
     * 'Gee thats tricky lets do it in userspace' the umount_begin
     * might fail to complete on the first run through as other tasks
     * must return, and the like. Thats for the mount program to worry
     * about for the moment.
     */

    if (flags & MNT_FORCE && sb->s_op->umount_begin) {
        sb->s_op->umount_begin(sb);
    }

    /*
     * No sense to grab the lock for this test, but test itself looks
     * somewhat bogus. Suggestions for better replacement?
     * Ho-hum... In principle, we might treat that as umount + switch
     * to rootfs. GC would eventually take care of the old vfsmount.
     * Actually it makes sense, especially if rootfs would contain a
     * /reboot - static binary that would close all descriptors and
     * call reboot(9). Then init(8) could umount root and exec /reboot.
     */
    if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
        /*
         * Special case for "unmounting" root ...
         * we just try to remount it readonly.
         */
        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
            return -EPERM;
        return do_umount_root(sb);
    }

    namespace_lock();
    lock_mount_hash();

    /* Recheck MNT_LOCKED with the locks held */
    retval = -EINVAL;
    if (mnt->mnt.mnt_flags & MNT_LOCKED)
        goto out;

    event++;
    if (flags & MNT_DETACH) {
        if (!list_empty(&mnt->mnt_list))
            umount_tree(mnt, UMOUNT_PROPAGATE);
        retval = 0;
    } else {
        shrink_submounts(mnt);
        retval = -EBUSY;
        if (!propagate_mount_busy(mnt, 2)) {
            if (!list_empty(&mnt->mnt_list))
                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
            retval = 0;
        }
    }
out:
    unlock_mount_hash();
    namespace_unlock();
    return retval;
}

After invoking umount_begin for a forced unmount, and once the mount is no longer busy (or a lazy detach was requested), the actual teardown is handed to umount_tree():

/*
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
    LIST_HEAD(tmp_list);
    struct mount *p;

    if (how & UMOUNT_PROPAGATE)
        propagate_mount_unlock(mnt);

    /* Gather the mounts to umount */
    for (p = mnt; p; p = next_mnt(p, mnt)) {
        p->mnt.mnt_flags |= MNT_UMOUNT;
        list_move(&p->mnt_list, &tmp_list);
    }

    /* Hide the mounts from mnt_mounts */
    list_for_each_entry(p, &tmp_list, mnt_list) {
        list_del_init(&p->mnt_child);
    }

    /* Add propogated mounts to the tmp_list */
    if (how & UMOUNT_PROPAGATE)
        propagate_umount(&tmp_list);

    while (!list_empty(&tmp_list)) {
        struct mnt_namespace *ns;
        bool disconnect;
        p = list_first_entry(&tmp_list, struct mount, mnt_list);
        list_del_init(&p->mnt_expire);
        list_del_init(&p->mnt_list);
        ns = p->mnt_ns;
        if (ns) {
            ns->mounts--;
            __touch_mnt_namespace(ns);
        }
        p->mnt_ns = NULL;
        if (how & UMOUNT_SYNC)
            p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

        disconnect = disconnect_mount(p, how);
        if (mnt_has_parent(p)) {
            mnt_add_count(p->mnt_parent, -1);
            if (!disconnect) {
                /* Don't forget about p */
                list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
            } else {
                umount_mnt(p);
            }
        }
        change_mnt_propagation(p, MS_PRIVATE);
        if (disconnect)
            hlist_add_head(&p->mnt_umount, &unmounted);
    }
}

A loop removes the mounts that are to be unmounted from the kernel's lists.
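For completeness, the userspace side goes through the same <sys/mount.h> header: umount() takes just the target, while umount2() adds a flags argument (MNT_DETACH requests a lazy unmount). A small sketch (placeholder mount point, privileged operation):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    if (umount2("/mnt", MNT_DETACH) == -1) {   /* lazy unmount */
        perror("umount2");
        return 1;
    }
    return 0;
}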

The open() system call

open() turns a pathname into a file descriptor.

glibc declares open in /usr/include/fcntl.h, and we can use that wrapper to open a file.

The kernel defines the open system call in fs/open.c:

SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
    if (force_o_largefile())
        flags |= O_LARGEFILE;
    return do_sys_open(AT_FDCWD, filename, flags, mode);
}

SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
        umode_t, mode)
{
    if (force_o_largefile())
        flags |= O_LARGEFILE;
    return do_sys_open(dfd, filename, flags, mode);
}

The difference between open and openat is that the latter resolves relative pathnames against a directory file descriptor, which lets different parts of a program (or different threads) work with different "current" directories.

force_o_largefile() checks whether O_LARGEFILE should always be set, regardless of the flags passed in from userspace.
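A userspace illustration of the openat() semantics described above (the paths are placeholders): the second argument is resolved relative to the directory descriptor rather than the process-wide current working directory.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int dirfd = open("/etc", O_RDONLY | O_DIRECTORY);
    if (dirfd < 0) {
        perror("open /etc");
        return 1;
    }

    int fd = openat(dirfd, "hostname", O_RDONLY);  /* opens /etc/hostname */
    if (fd < 0) {
        perror("openat");
        return 1;
    }

    close(fd);
    close(dirfd);
    return 0;
}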

The file is then opened via do_sys_openat2():

long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
    struct open_how how = build_open_how(flags, mode);
    return do_sys_openat2(dfd, filename, &how);
}

static long do_sys_openat2(int dfd, const char __user *filename,
               struct open_how *how)
{
    struct open_flags op;
    int fd = build_open_flags(how, &op);
    struct filename *tmp;

    if (fd)
        return fd;

    tmp = getname(filename);
    if (IS_ERR(tmp))
        return PTR_ERR(tmp);

    fd = get_unused_fd_flags(how->flags);
    if (fd >= 0) {
        struct file *f = do_filp_open(dfd, tmp, &op);
        if (IS_ERR(f)) {
            put_unused_fd(fd);
            fd = PTR_ERR(f);
        } else {
            fsnotify_open(f);
            fd_install(fd, f);
        }
    }
    putname(tmp);
    return fd;
}

  1. First, an unused file descriptor is obtained

  2. do_filp_open looks up the file's inode

  3. The file instance is installed into the process's file descriptor table

static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
    struct path path;
    int error = path_lookupat(nd, flags, &path);
    if (!error) {
        audit_inode(nd->name, path.dentry, 0);
        error = vfs_open(&path, file);
        path_put(&path);
    }
    return error;
}

static struct file *path_openat(struct nameidata *nd,
            const struct open_flags *op, unsigned flags)
{
    struct file *file;
    int error;

    file = alloc_empty_file(op->open_flag, current_cred());
    if (IS_ERR(file))
        return file;

    if (unlikely(file->f_flags & __O_TMPFILE)) {
        error = do_tmpfile(nd, flags, op, file);
    } else if (unlikely(file->f_flags & O_PATH)) {
        error = do_o_path(nd, flags, file);
    } else {
        const char *s = path_init(nd, flags);
        while (!(error = link_path_walk(s, nd)) &&
               (s = open_last_lookups(nd, file, op)) != NULL)
            ;
        if (!error)
            error = do_open(nd, file, op);
        terminate_walk(nd);
    }
    if (likely(!error)) {
        if (likely(file->f_mode & FMODE_OPENED))
            return file;
        WARN_ON(1);
        error = -EINVAL;
    }
    fput(file);
    if (error == -EOPENSTALE) {
        if (flags & LOOKUP_RCU)
            error = -ECHILD;
        else
            error = -ESTALE;
    }
    return ERR_PTR(error);
}

struct file *do_filp_open(int dfd, struct filename *pathname,
        const struct open_flags *op)
{
    struct nameidata nd;
    int flags = op->lookup_flags;
    struct file *filp;

    set_nameidata(&nd, dfd, pathname, NULL);
    filp = path_openat(&nd, op, flags | LOOKUP_RCU);
    if (unlikely(filp == ERR_PTR(-ECHILD)))
        filp = path_openat(&nd, op, flags);
    if (unlikely(filp == ERR_PTR(-ESTALE)))
        filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
    restore_nameidata();
    return filp;
}

Again this is just another layer over the *at variant; in the end vfs_open() is reached to actually obtain the file.

/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flag initialized
 * @cred: credentials to use
 */
int vfs_open(const struct path *path, struct file *file)
{
    file->f_path = *path;
    return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}
static int do_dentry_open(struct file *f,
              struct inode *inode,
              int (*open)(struct inode *, struct file *))
{
    static const struct file_operations empty_fops = {};
    int error;

    path_get(&f->f_path);
    f->f_inode = inode;
    f->f_mapping = inode->i_mapping;
    f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
    f->f_sb_err = file_sample_sb_err(f);

    if (unlikely(f->f_flags & O_PATH)) {
        f->f_mode = FMODE_PATH | FMODE_OPENED;
        f->f_op = &empty_fops;
        return 0;
    }

    if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
        error = get_write_access(inode);
        if (unlikely(error))
            goto cleanup_file;
        error = __mnt_want_write(f->f_path.mnt);
        if (unlikely(error)) {
            put_write_access(inode);
            goto cleanup_file;
        }
        f->f_mode |= FMODE_WRITER;
    }

    /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
    if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
        f->f_mode |= FMODE_ATOMIC_POS;

    f->f_op = fops_get(inode->i_fop);
    if (WARN_ON(!f->f_op)) {
        error = -ENODEV;
        goto cleanup_all;
    }

    error = security_file_open(f);
    if (error)
        goto cleanup_all;

    error = break_lease(locks_inode(f), f->f_flags);
    if (error)
        goto cleanup_all;

    /* normally all 3 are set; ->open() can clear them if needed */
    f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
    if (!open)
        open = f->f_op->open;
    if (open) {
        error = open(inode, f);
        if (error)
            goto cleanup_all;
    }
    f->f_mode |= FMODE_OPENED;
    if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
        i_readcount_inc(inode);
    if ((f->f_mode & FMODE_READ) &&
         likely(f->f_op->read || f->f_op->read_iter))
        f->f_mode |= FMODE_CAN_READ;
    if ((f->f_mode & FMODE_WRITE) &&
         likely(f->f_op->write || f->f_op->write_iter))
        f->f_mode |= FMODE_CAN_WRITE;

    f->f_write_hint = WRITE_LIFE_NOT_SET;
    f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

    file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

    /* NB: we're sure to have correct a_ops only after f_op->open */
    if (f->f_flags & O_DIRECT) {
        if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
            return -EINVAL;
    }

    /*
     * XXX: Huge page cache doesn't support writing yet. Drop all page
     * cache for this file before processing writes.
     */
    if (f->f_mode & FMODE_WRITE) {
        /*
         * Paired with smp_mb() in collapse_file() to ensure nr_thps
         * is up to date and the update to i_writecount by
         * get_write_access() is visible. Ensures subsequent insertion
         * of THPs into the page cache will fail.
         */
        smp_mb();
        if (filemap_nr_thps(inode->i_mapping)) {
            struct address_space *mapping = inode->i_mapping;

            filemap_invalidate_lock(inode->i_mapping);
            /*
             * unmap_mapping_range just need to be called once
             * here, because the private pages is not need to be
             * unmapped mapping (e.g. data segment of dynamic
             * shared libraries here).
             */
            unmap_mapping_range(mapping, 0, 0, 0);
            truncate_inode_pages(mapping, 0);
            filemap_invalidate_unlock(inode->i_mapping);
        }
    }

    return 0;

cleanup_all:
    if (WARN_ON_ONCE(error > 0))
        error = -EINVAL;
    fops_put(f->f_op);
    if (f->f_mode & FMODE_WRITER) {
        put_write_access(inode);
        __mnt_drop_write(f->f_path.mnt);
    }
cleanup_file:
    path_put(&f->f_path);
    f->f_path.mnt = NULL;
    f->f_path.dentry = NULL;
    f->f_inode = NULL;
    return error;
}

The close() system call

The close system call breaks the association between a file descriptor and the corresponding file. The kernel, again in fs/open.c, provides two variants: one closes a single descriptor, the other a contiguous range of descriptors:

/*
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
    int retval = close_fd(fd);

    /* can't restart close syscall because file table entry was cleared */
    if (unlikely(retval == -ERESTARTSYS ||
             retval == -ERESTARTNOINTR ||
             retval == -ERESTARTNOHAND ||
             retval == -ERESTART_RESTARTBLOCK))
        retval = -EINTR;

    return retval;
}

/**
 * close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  reserved for future extensions
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 * Currently, errors to close a given file descriptor are ignored.
 */
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
        unsigned int, flags)
{
    return __close_range(fd, max_fd, flags);
}

The former, shown below, looks up the file and calls filp_close() directly; the latter goes through __range_close(), which also ends up calling filp_close():

int close_fd(unsigned fd)
{
    struct files_struct *files = current->files;
    struct file *file;

    file = pick_file(files, fd);
    if (IS_ERR(file))
        return -EBADF;

    return filp_close(file, files);
}
EXPORT_SYMBOL(close_fd); /* for ksys_close() */

filp_close() flushes the file and removes the task's POSIX file locks:

/*
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 */
int filp_close(struct file *filp, fl_owner_t id)
{
    int retval = 0;

    if (!file_count(filp)) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }

    if (filp->f_op->flush)
        retval = filp->f_op->flush(filp, id);

    if (likely(!(filp->f_mode & FMODE_PATH))) {
        dnotify_flush(filp, id);
        locks_remove_posix(filp, id);
    }
    fput(filp);
    return retval;
}

EXPORT_SYMBOL(filp_close);
/*
 * This function is called when the file is being removed
 * from the task's fd array.  POSIX locks belonging to this task
 * are deleted at this time.
 */
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
    int error;
    struct inode *inode = locks_inode(filp);
    struct file_lock lock;
    struct file_lock_context *ctx;

    /*
     * If there are no locks held on this file, we don't need to call
     * posix_lock_file().  Another process could be setting a lock on this
     * file at the same time, but we wouldn't remove that lock anyway.
     */
    ctx =  smp_load_acquire(&inode->i_flctx);
    if (!ctx || list_empty(&ctx->flc_posix))
        return;

    locks_init_lock(&lock);
    lock.fl_type = F_UNLCK;
    lock.fl_flags = FL_POSIX | FL_CLOSE;
    lock.fl_start = 0;
    lock.fl_end = OFFSET_MAX;
    lock.fl_owner = owner;
    lock.fl_pid = current->tgid;
    lock.fl_file = filp;
    lock.fl_ops = NULL;
    lock.fl_lmops = NULL;

    error = vfs_lock_file(filp, F_SETLK, &lock, NULL);

    if (lock.fl_ops && lock.fl_ops->fl_release_private)
        lock.fl_ops->fl_release_private(&lock);
    trace_locks_remove_posix(inode, &lock, error);
}
EXPORT_SYMBOL(locks_remove_posix);
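
One userspace consequence of locks_remove_posix being driven from filp_close: POSIX record locks are owned per process, so closing any descriptor that refers to the file drops all of the process's locks on it. A small hedged sketch (the path /tmp/lockdemo is just an example name):

/* Take a write lock through fd1, then close an unrelated fd2 on the same
 * file: filp_close() -> locks_remove_posix() releases the fd1 lock too. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd1 = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0600);
    int fd2 = open("/tmp/lockdemo", O_RDWR);
    if (fd1 < 0 || fd2 < 0) { perror("open"); return 1; }

    struct flock fl = {
        .l_type = F_WRLCK, .l_whence = SEEK_SET, .l_start = 0, .l_len = 0,
    };
    if (fcntl(fd1, F_SETLK, &fl) < 0) { perror("F_SETLK"); return 1; }

    close(fd2);   /* the whole-file lock taken via fd1 is now gone as well */

    close(fd1);
    return 0;
}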

The read() system call

The glibc wrapper is declared in posix/unistd.h:

/* Read NBYTES into BUF from FD.  Return the
   number read, -1 for errors or 0 for EOF.

   This function is a cancellation point and therefore not marked with
   __THROW.  */
extern ssize_t read (int __fd, void *__buf, size_t __nbytes) __wur
    __fortified_attr_access (__write_only__, 2, 3);

The wrapper itself is defined in sysdeps/unix/sysv/linux/read.c:

#include <unistd.h>
#include <sysdep-cancel.h>

/* Read NBYTES into BUF from FD.  Return the number read or -1.  */
ssize_t
__libc_read (int fd, void *buf, size_t nbytes)
{
  return SYSCALL_CANCEL (read, fd, buf, nbytes);
}
libc_hidden_def (__libc_read)

libc_hidden_def (__read)
weak_alias (__libc_read, __read)
libc_hidden_def (read)
weak_alias (__libc_read, read)

Finally, make-syscalls.sh turns the syscalls.list entry into the exported symbols:

read        -   read        Ci:ibU  __libc_read __read read

The read system call itself is implemented in fs/read_write.c:

ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
    struct fd f = fdget_pos(fd);
    ssize_t ret = -EBADF;

    if (f.file) {
        loff_t pos, *ppos = file_ppos(f.file);
        if (ppos) {
            pos = *ppos;
            ppos = &pos;
        }
        ret = vfs_read(f.file, buf, count, ppos);
        if (ret >= 0 && ppos)
            f.file->f_pos = pos;
        fdput_pos(f);
    }
    return ret;
}

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    return ksys_read(fd, buf, count);
}

read takes three arguments: it reads up to count bytes from the file referred to by the descriptor fd into the buffer buf.
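
That contract has two well-known caveats which the kernel code traced below has to honour: a read may return fewer bytes than requested, and it may be interrupted by a signal. A hedged userspace sketch of the usual handling (read_all is just an illustrative helper name):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* Read exactly count bytes unless EOF or a real error occurs. */
static ssize_t read_all(int fd, char *buf, size_t count)
{
    size_t done = 0;
    while (done < count) {
        ssize_t n = read(fd, buf + done, count - done);
        if (n < 0) {
            if (errno == EINTR)   /* interrupted before any byte was read: retry */
                continue;
            return -1;            /* real error, errno set from the kernel's return value */
        }
        if (n == 0)               /* EOF */
            break;
        done += n;
    }
    return (ssize_t)done;
}

int main(void)
{
    char buf[4096];
    ssize_t n = read_all(STDIN_FILENO, buf, sizeof(buf));
    if (n < 0) { perror("read"); return 1; }
    printf("got %zd bytes\n", n);
    return 0;
}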

File descriptor lookup

A lightweight descriptor lookup is used; it returns a small structure carrying the struct file pointer plus a couple of flag bits:

struct fd {
    struct file *file;
    unsigned int flags;
};
#define FDPUT_FPUT       1
#define FDPUT_POS_UNLOCK 2

This part contains a bit of black magic, but let's first look at how the file is located from the descriptor.

fdget_pos is defined in include/linux/file.h:

// ....
static inline struct fd __to_fd(unsigned long v)
{
    return (struct fd){(struct file *)(v & ~3),v & 3};
}
// ....
static inline struct fd fdget_pos(int fd)
{
    return __to_fd(__fdget_pos(fd));
}
// ....

__fdget_pos is defined in fs/file.c:

unsigned long __fdget_pos(unsigned int fd)
{
    unsigned long v = __fdget(fd);
    struct file *file = (struct file *)(v & ~3);

    if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
        if (file_count(file) > 1) {
            v |= FDPUT_POS_UNLOCK;
            mutex_lock(&file->f_pos_lock);
        }
    }
    return v;
}

__fdget(fd) obtains the struct file through __fget_light, the lightweight lookup helper.

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
    struct files_struct *files = current->files;
    struct file *file;

    if (atomic_read(&files->count) == 1) {
        file = files_lookup_fd_raw(files, fd);
        if (!file || unlikely(file->f_mode & mask))
            return 0;
        return (unsigned long)file;
    } else {
        file = __fget(fd, mask, 1);
        if (!file)
            return 0;
        return FDPUT_FPUT | (unsigned long)file;
    }
}
unsigned long __fdget(unsigned int fd)
{
    return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);

current->files is the open-file table of the current task (not of the current CPU):

/*
 * Open file table structure
 */
struct files_struct {
  /*
   * read mostly part
   */
    atomic_t count;
    bool resize_in_progress;
    wait_queue_head_t resize_wait;

    struct fdtable __rcu *fdt;
    struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
    spinlock_t file_lock ____cacheline_aligned_in_smp;
    unsigned int next_fd;
    unsigned long close_on_exec_init[1];
    unsigned long open_fds_init[1];
    unsigned long full_fds_bits_init[1];
    struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

The mapping from descriptor numbers to struct file lives in the fdtable:

struct fdtable {
    unsigned int max_fds;
    struct file __rcu **fd;      /* current fd array */
    unsigned long *close_on_exec;
    unsigned long *open_fds;
    unsigned long *full_fds_bits;
    struct rcu_head rcu;
};
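
From userspace this per-task table is visible as /proc/self/fd, where every entry is a symlink from a descriptor number to the object it refers to. A small hedged sketch that prints the current process's table:

#include <dirent.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    DIR *d = opendir("/proc/self/fd");
    if (!d) { perror("opendir"); return 1; }

    struct dirent *e;
    while ((e = readdir(d)) != NULL) {
        char target[256] = "";
        char path[288];

        if (e->d_name[0] == '.')
            continue;
        snprintf(path, sizeof(path), "/proc/self/fd/%s", e->d_name);
        ssize_t n = readlink(path, target, sizeof(target) - 1);
        if (n > 0)
            target[n] = '\0';
        printf("fd %s -> %s\n", e->d_name, target);   /* one entry per open descriptor */
    }
    closedir(d);
    return 0;
}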

The lookup first reads the atomic counter files->count with atomic_read:

  • If it is 1, the files_struct is not shared with any other task, so no extra reference and no locking are needed: the file can be looked up directly with files_lookup_fd_raw.

files_lookup_fd_raw dereferences the raw fdtable and returns the struct file stored at index fd of its fd array.

/*
 * The caller must ensure that fd table isn't shared or hold rcu or file lock
 */
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
    struct fdtable *fdt = rcu_dereference_raw(files->fdt);

    if (fd < fdt->max_fds) {
        fd = array_index_nospec(fd, fdt->max_fds);
        return rcu_dereference_raw(fdt->fd[fd]);
    }
    return NULL;
}
  • If it is not 1, the table is shared with other tasks, so __fget must take a real reference: under rcu_read_lock it looks up the fdtable entry, grabs refs references on the struct file with get_file_rcu_many, and then re-checks that neither the table nor the entry changed underneath it; otherwise it drops the references and retries.

static inline struct file *__fget_files_rcu(struct files_struct *files,
    unsigned int fd, fmode_t mask, unsigned int refs)
{
    for (;;) {
        struct file *file;
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        struct file __rcu **fdentry;

        if (unlikely(fd >= fdt->max_fds))
            return NULL;

        fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
        file = rcu_dereference_raw(*fdentry);
        if (unlikely(!file))
            return NULL;

        if (unlikely(file->f_mode & mask))
            return NULL;

        /*
         * Ok, we have a file pointer. However, because we do
         * this all locklessly under RCU, we may be racing with
         * that file being closed.
         *
         * Such a race can take two forms:
         *
         *  (a) the file ref already went down to zero,
         *      and get_file_rcu_many() fails. Just try
         *      again:
         */
        if (unlikely(!get_file_rcu_many(file, refs)))
            continue;

        /*
         *  (b) the file table entry has changed under us.
         *       Note that we don't need to re-check the 'fdt->fd'
         *       pointer having changed, because it always goes
         *       hand-in-hand with 'fdt'.
         *
         * If so, we need to put our refs and try again.
         */
        if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
            unlikely(rcu_dereference_raw(*fdentry) != file)) {
            fput_many(file, refs);
            continue;
        }

        /*
         * Ok, we have a ref to the file, and checked that it
         * still exists.
         */
        return file;
    }
}

static struct file *__fget_files(struct files_struct *files, unsigned int fd,
                 fmode_t mask, unsigned int refs)
{
    struct file *file;

    rcu_read_lock();
    file = __fget_files_rcu(files, fd, mask, refs);
    rcu_read_unlock();

    return file;
}

static inline struct file *__fget(unsigned int fd, fmode_t mask,
                  unsigned int refs)
{
    return __fget_files(current->files, fd, mask, refs);
}

Finally comes a check: if no file was found, ksys_read simply returns -EBADF.

The lookup also masks the mode against FMODE_PATH: a file opened with O_PATH has this bit set and almost nothing can be done with it, so both __fget_light and __fget refuse to return such a file here.

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH      ((__force fmode_t)0x4000)

In the shared case the return value is FDPUT_FPUT | (unsigned long)file: bit 0 records that the extra reference must be dropped once the read is finished.

__fdget_pos receives this packed value as an unsigned long v, and __to_fd() turns it back into a struct fd (pointer plus flags).

Finally, if a file was found and it requires atomic f_pos handling (FMODE_ATOMIC_POS with more than one reference), f_pos_lock is taken and FDPUT_POS_UNLOCK is ORed into v, telling fdput_pos() to release the position lock when the read is done.
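
The whole struct fd trick relies on struct file pointers being at least 4-byte aligned, which leaves the two low bits free for FDPUT_FPUT and FDPUT_POS_UNLOCK. A hedged userspace sketch of the same tagged-pointer pattern (plain C; the names only mirror the kernel's, this is not kernel code):

#include <assert.h>
#include <stdio.h>

#define FDPUT_FPUT       1UL   /* drop the extra reference when done */
#define FDPUT_POS_UNLOCK 2UL   /* unlock f_pos_lock when done */

struct file { int dummy; };

static unsigned long pack(struct file *file, unsigned long flags)
{
    assert(((unsigned long)file & 3) == 0);   /* alignment leaves low bits free */
    return (unsigned long)file | flags;
}

static struct file *unpack_file(unsigned long v)   { return (struct file *)(v & ~3UL); }
static unsigned long unpack_flags(unsigned long v) { return v & 3UL; }

int main(void)
{
    static struct file f;
    unsigned long v = pack(&f, FDPUT_FPUT | FDPUT_POS_UNLOCK);

    printf("file=%p flags=%lu\n", (void *)unpack_file(v), unpack_flags(v));
    return 0;
}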

Initializing ret

The linux-headers package ships /usr/include/asm-generic/errno-base.h, which defines the basic errno values:

#define EBADF        9  /* Bad file number */

    ssize_t ret = -EBADF;

Getting the current position

file_ppos returns NULL if the file is a stream, otherwise a pointer to its position:

/* file_ppos returns &file->f_pos or NULL if file is stream */
static inline loff_t *file_ppos(struct file *file)
{
    return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
}
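
Pipes, for example, are marked FMODE_STREAM, which is why they have no position at all; from userspace the same fact shows up as lseek failing with ESPIPE. A hedged sketch:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int pipefd[2];
    if (pipe(pipefd) < 0) { perror("pipe"); return 1; }

    if (lseek(pipefd[0], 0, SEEK_CUR) < 0)
        perror("lseek on pipe");   /* expected: ESPIPE (Illegal seek) */

    close(pipefd[0]);
    close(pipefd[1]);
    return 0;
}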

vfs_read

Some of the f_mode flag bits involved:

/*
 * Don't update ctime and mtime.
 *
 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
 */
#define FMODE_NOCMTIME      ((__force fmode_t)0x800)

/* Expect random access pattern */
#define FMODE_RANDOM        ((__force fmode_t)0x1000)

/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
#define FMODE_UNSIGNED_OFFSET   ((__force fmode_t)0x2000)

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH      ((__force fmode_t)0x4000)

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS    ((__force fmode_t)0x8000)
/* Write access to underlying fs */
#define FMODE_WRITER        ((__force fmode_t)0x10000)
/* Has read method(s) */
#define FMODE_CAN_READ          ((__force fmode_t)0x20000)
/* Has write method(s) */
#define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)

#define FMODE_OPENED        ((__force fmode_t)0x80000)
#define FMODE_CREATED       ((__force fmode_t)0x100000)

/* File is stream-like */
#define FMODE_STREAM        ((__force fmode_t)0x200000)

/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY      ((__force fmode_t)0x4000000)

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT        ((__force fmode_t)0x8000000)

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT  ((__force fmode_t)0x10000000)

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT     ((__force fmode_t)0x20000000)

/* File supports async buffered reads */
#define FMODE_BUF_RASYNC    ((__force fmode_t)0x40000000)

vfs_read starts with some sanity checks: was the file opened for reading, does it have a read method at all, and is the user buffer a valid userspace range?

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_READ))
        return -EBADF;
    if (!(file->f_mode & FMODE_CAN_READ))
        return -EINVAL;
    if (unlikely(!access_ok(buf, count)))
        return -EFAULT;

    ret = rw_verify_area(READ, file, pos, count);
    if (ret)
        return ret;
    if (count > MAX_RW_COUNT)
        count =  MAX_RW_COUNT;

    if (file->f_op->read)
        ret = file->f_op->read(file, buf, count, pos);
    else if (file->f_op->read_iter)
        ret = new_sync_read(file, buf, count, pos);
    else
        ret = -EINVAL;
    if (ret > 0) {
        fsnotify_access(file);
        add_rchar(current, ret);
    }
    inc_syscr(current);
    return ret;
}

rw_verify_area then validates the I/O range: the position must make sense for this file, and the LSM security hook must grant read (or write) permission:

int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
    if (unlikely((ssize_t) count < 0))
        return -EINVAL;

    /*
     * ranged mandatory locking does not apply to streams - it makes sense
     * only for files where position has a meaning.
     */
    if (ppos) {
        loff_t pos = *ppos;

        if (unlikely(pos < 0)) {
            if (!unsigned_offsets(file))
                return -EINVAL;
            if (count >= -pos) /* both values are in 0..LLONG_MAX */
                return -EOVERFLOW;
        } else if (unlikely((loff_t) (pos + count) < 0)) {
            if (!unsigned_offsets(file))
                return -EINVAL;
        }
    }

    return security_file_permission(file,
                read_write == READ ? MAY_READ : MAY_WRITE);
}

The request size is clamped to MAX_RW_COUNT, and the file's own ->read method is used if it has one; otherwise new_sync_read provides a generic synchronous read on top of ->read_iter. After a successful read, fsnotify_access notifies the filesystem-event listeners; add_rchar and inc_syscr only do per-task I/O accounting (without the relevant config options they are empty).

new_sync_read

A generic helper:

static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    struct iov_iter iter;
    ssize_t ret;

    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = (ppos ? *ppos : 0);
    iov_iter_init(&iter, READ, &iov, 1, len);

    ret = call_read_iter(filp, &kiocb, &iter);
    BUG_ON(ret == -EIOCBQUEUED);
    if (ppos)
        *ppos = kiocb.ki_pos;
    return ret;
}

The local variables describe the I/O request: an iovec for the user buffer, a kiocb (kernel I/O control block) and an iov_iter that walks over it:

/* Structure for scatter/gather I/O.  */
struct iovec
  {
    void *iov_base; /* Pointer to data.  */
    size_t iov_len; /* Length of data.  */
  };
// ...
struct kiocb {
    struct file     *ki_filp;

    /* The 'ki_filp' pointer is shared in a union for aio */
    randomized_struct_fields_start

    loff_t          ki_pos;
    void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
    void            *private;
    int         ki_flags;
    u16         ki_hint;
    u16         ki_ioprio; /* See linux/ioprio.h */
    union {
        unsigned int        ki_cookie; /* for ->iopoll */
        struct wait_page_queue  *ki_waitq; /* for async buffered IO */
    };

    randomized_struct_fields_end
};
// ...
struct iov_iter {
    u8 iter_type;
    bool nofault;
    bool data_source;
    size_t iov_offset;
    size_t count;
    union {
        const struct iovec *iov;
        const struct kvec *kvec;
        const struct bio_vec *bvec;
        struct xarray *xarray;
        struct pipe_inode_info *pipe;
    };
    union {
        unsigned long nr_segs;
        struct {
            unsigned int head;
            unsigned int start_head;
        };
        loff_t xarray_start;
    };
};
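
struct iovec is the same scatter/gather descriptor userspace passes to readv(2); the kernel wraps one or more of them in an iov_iter before calling ->read_iter(). A hedged sketch of the userspace side:

#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    char hdr[16], body[64];
    struct iovec iov[2] = {
        { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
        { .iov_base = body, .iov_len = sizeof(body) },
    };

    /* Fill both buffers with a single system call (here from stdin). */
    ssize_t n = readv(STDIN_FILENO, iov, 2);
    if (n < 0)
        perror("readv");
    else
        printf("readv copied %zd bytes across 2 buffers\n", n);
    return 0;
}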

call_read_iter in include/linux/fs.h then hands the request over to the read_iter routine defined in the file's file_operations:

static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
                     struct iov_iter *iter)
{
    return file->f_op->read_iter(kio, iter);
}

minix, for instance, simply uses the generic generic_file_read_iter:

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:   kernel I/O control block
 * @iter:   destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
    size_t count = iov_iter_count(iter);
    ssize_t retval = 0;

    if (!count)
        return 0; /* skip atime */

    if (iocb->ki_flags & IOCB_DIRECT) {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        loff_t size;

        size = i_size_read(inode);
        if (iocb->ki_flags & IOCB_NOWAIT) {
            if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
                        iocb->ki_pos + count - 1))
                return -EAGAIN;
        } else {
            retval = filemap_write_and_wait_range(mapping,
                        iocb->ki_pos,
                            iocb->ki_pos + count - 1);
            if (retval < 0)
                return retval;
        }

        file_accessed(file);

        retval = mapping->a_ops->direct_IO(iocb, iter);
        if (retval >= 0) {
            iocb->ki_pos += retval;
            count -= retval;
        }
        if (retval != -EIOCBQUEUED)
            iov_iter_revert(iter, count - iov_iter_count(iter));

        /*
         * Btrfs can have a short DIO read if we encounter
         * compressed extents, so if there was an error, or if
         * we've already read everything we wanted to, or if
         * there was a short read because we hit EOF, go ahead
         * and return.  Otherwise fallthrough to buffered io for
         * the rest of the read.  Buffered reads will not work for
         * DAX files, so don't bother trying.
         */
        if (retval < 0 || !count || iocb->ki_pos >= size ||
            IS_DAX(inode))
            return retval;
    }

    return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);

First a quick check of how many bytes are requested:

static inline size_t iov_iter_count(const struct iov_iter *i)
{
    return i->count;
}

If IOCB_DIRECT is set, the read goes through the mapping's ->direct_IO. If IOCB_NOWAIT is also set, the function will not wait for pending writeback and returns -EAGAIN right away when the range still needs it.

The next step would be ->direct_IO, but ext4 and minix do not instantiate it: both point it at noop_direct_IO, which simply returns -EINVAL (ext4 handles O_DIRECT in its own ->read_iter instead). Only a few filesystems, such as ext2, ocfs2 and nilfs, still provide their own direct_IO implementation.
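
From userspace the IOCB_DIRECT flag comes from opening the file with O_DIRECT, which also imposes alignment requirements on the buffer. A hedged sketch (the 4096-byte alignment is an assumption that suits most devices, and it only works on filesystems that accept O_DIRECT):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc < 2) { fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }

    int fd = open(argv[1], O_RDONLY | O_DIRECT);
    if (fd < 0) { perror("open(O_DIRECT)"); return 1; }

    void *buf;
    /* Alignment requirements are device dependent; 4096 is a common safe value. */
    if (posix_memalign(&buf, 4096, 4096)) { close(fd); return 1; }

    ssize_t n = read(fd, buf, 4096);   /* direct I/O, bypassing the page cache */
    if (n < 0)
        perror("read");
    else
        printf("read %zd bytes with O_DIRECT\n", n);

    free(buf);
    close(fd);
    return 0;
}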

The other case reads through the page cache of the file's address_space mapping:

/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and readpage address_space operations to fetch it.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
        ssize_t already_read)
{
    struct file *filp = iocb->ki_filp;
    struct file_ra_state *ra = &filp->f_ra;
    struct address_space *mapping = filp->f_mapping;
    struct inode *inode = mapping->host;
    struct pagevec pvec;
    int i, error = 0;
    bool writably_mapped;
    loff_t isize, end_offset;

    if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
        return 0;
    if (unlikely(!iov_iter_count(iter)))
        return 0;

    iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
    pagevec_init(&pvec);

    do {
        cond_resched();

        /*
         * If we've already successfully copied some data, then we
         * can no longer safely return -EIOCBQUEUED. Hence mark
         * an async read NOWAIT at that point.
         */
        if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
            iocb->ki_flags |= IOCB_NOWAIT;

        error = filemap_get_pages(iocb, iter, &pvec);
        if (error < 0)
            break;

        /*
         * i_size must be checked after we know the pages are Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */
        isize = i_size_read(inode);
        if (unlikely(iocb->ki_pos >= isize))
            goto put_pages;
        end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

        /*
         * Once we start copying data, we don't want to be touching any
         * cachelines that might be contended:
         */
        writably_mapped = mapping_writably_mapped(mapping);

        /*
         * When a sequential read accesses a page several times, only
         * mark it as accessed the first time.
         */
        if (iocb->ki_pos >> PAGE_SHIFT !=
            ra->prev_pos >> PAGE_SHIFT)
            mark_page_accessed(pvec.pages[0]);

        for (i = 0; i < pagevec_count(&pvec); i++) {
            struct page *page = pvec.pages[i];
            size_t page_size = thp_size(page);
            size_t offset = iocb->ki_pos & (page_size - 1);
            size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
                         page_size - offset);
            size_t copied;

            if (end_offset < page_offset(page))
                break;
            if (i > 0)
                mark_page_accessed(page);
            /*
             * If users can be writing to this page using arbitrary
             * virtual addresses, take care about potential aliasing
             * before reading the page on the kernel side.
             */
            if (writably_mapped) {
                int j;

                for (j = 0; j < thp_nr_pages(page); j++)
                    flush_dcache_page(page + j);
            }

            copied = copy_page_to_iter(page, offset, bytes, iter);

            already_read += copied;
            iocb->ki_pos += copied;
            ra->prev_pos = iocb->ki_pos;

            if (copied < bytes) {
                error = -EFAULT;
                break;
            }
        }
put_pages:
        for (i = 0; i < pagevec_count(&pvec); i++)
            put_page(pvec.pages[i]);
        pagevec_reinit(&pvec);
    } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

    file_accessed(filp);

    return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);

After a few checks the function enters a do {} while loop.

While copying, the CPU is running in kernel mode. On a fully preemptible kernel (common for desktop and embedded configurations) a higher-priority task can preempt this code anyway, but on the non-preemptible or voluntary-preemption kernels typical of servers it cannot, so each loop iteration starts with cond_resched() to give the scheduler a chance to run something else.

filemap_get_pages then fetches the pages.

copy_page_to_iter copies the data from each page cache page into the user buffer described by the iov_iter.

filemap_get_pages first calls filemap_get_read_batch to collect the pages that are already in the page cache. If there are none, the data is not cached yet, so it triggers synchronous readahead and looks again. If the pages are still missing and IOCB_NOWAIT or IOCB_WAITQ is set, it returns -EAGAIN; otherwise it creates the page itself with filemap_create_page, and if that returns AOP_TRUNCATED_PAGE (the page was locked or truncated under us) the whole lookup is retried.

static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
        struct pagevec *pvec)
{
    struct file *filp = iocb->ki_filp;
    struct address_space *mapping = filp->f_mapping;
    struct file_ra_state *ra = &filp->f_ra;
    pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
    pgoff_t last_index;
    struct page *page;
    int err = 0;

    last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
retry:
    if (fatal_signal_pending(current))
        return -EINTR;

    filemap_get_read_batch(mapping, index, last_index, pvec);
    if (!pagevec_count(pvec)) {
        if (iocb->ki_flags & IOCB_NOIO)
            return -EAGAIN;
        page_cache_sync_readahead(mapping, ra, filp, index,
                last_index - index);
        filemap_get_read_batch(mapping, index, last_index, pvec);
    }
    if (!pagevec_count(pvec)) {
        if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
            return -EAGAIN;
        err = filemap_create_page(filp, mapping,
                iocb->ki_pos >> PAGE_SHIFT, pvec);
        if (err == AOP_TRUNCATED_PAGE)
            goto retry;
        return err;
    }

    page = pvec->pages[pagevec_count(pvec) - 1];
    if (PageReadahead(page)) {
        err = filemap_readahead(iocb, filp, mapping, page, last_index);
        if (err)
            goto err;
    }
    if (!PageUptodate(page)) {
        if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
            iocb->ki_flags |= IOCB_NOWAIT;
        err = filemap_update_page(iocb, mapping, iter, page);
        if (err)
            goto err;
    }

    return 0;
err:
    if (err < 0)
        put_page(page);
    if (likely(--pvec->nr))
        return 0;
    if (err == AOP_TRUNCATED_PAGE)
        goto retry;
    return err;
}

Once the batch is obtained, if the last page carries the readahead marker, asynchronous readahead is kicked off for the next window.

The page is then checked for being uptodate; if it is not, filemap_update_page brings it up to date.

Finally the errors are handled; if everything went well, 0 is returned.

Synchronous page cache readahead

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
        unsigned long nr_to_read)
{
    struct address_space *mapping = ractl->mapping;
    struct file_ra_state *ra = ractl->ra;
    struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
    unsigned long max_pages, index;

    if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
            !mapping->a_ops->readahead))
        return;

    /*
     * If the request exceeds the readahead window, allow the read to
     * be up to the optimal hardware IO size
     */
    index = readahead_index(ractl);
    max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
    nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
    while (nr_to_read) {
        unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

        if (this_chunk > nr_to_read)
            this_chunk = nr_to_read;
        ractl->_index = index;
        do_page_cache_ra(ractl, this_chunk, 0);

        index += this_chunk;
        nr_to_read -= this_chunk;
    }
}
// ... 
void page_cache_sync_ra(struct readahead_control *ractl,
        unsigned long req_count)
{
    bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

    /*
     * Even if read-ahead is disabled, issue this request as read-ahead
     * as we'll need it to satisfy the requested range. The forced
     * read-ahead will do the right thing and limit the read to just the
     * requested range, which we'll set to 1 page for this case.
     */
    if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
        if (!ractl->file)
            return;
        req_count = 1;
        do_forced_ra = true;
    }

    /* be dumb */
    if (do_forced_ra) {
        force_page_cache_ra(ractl, req_count);
        return;
    }

    /* do read-ahead */
    ondemand_readahead(ractl, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
// ...
static inline
void page_cache_sync_readahead(struct address_space *mapping,
        struct file_ra_state *ra, struct file *file, pgoff_t index,
        unsigned long req_count)
{
    DEFINE_READAHEAD(ractl, file, ra, mapping, index);
    page_cache_sync_ra(&ractl, req_count);
}
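
Userspace has a few knobs that land in exactly this code: posix_fadvise(POSIX_FADV_RANDOM) sets FMODE_RANDOM (forcing the small, "dumb" readahead above), while POSIX_FADV_WILLNEED and the readahead(2) system call both end up in force_page_cache_ra. A hedged sketch:

/* Hint a random access pattern and prefault part of the file into the
 * page cache. Assumes _GNU_SOURCE for readahead(2). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc < 2) { fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }

    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    /* Sets FMODE_RANDOM on the struct file: readahead stays small/forced. */
    posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);

    /* Ask the kernel to populate the page cache for the first 2 MiB. */
    if (readahead(fd, 0, 2 * 1024 * 1024) < 0)
        perror("readahead");

    close(fd);
    return 0;
}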

Asynchronous page cache readahead

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct readahead_control *ractl,
        bool hit_readahead_marker, unsigned long req_size)
{
    struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
    struct file_ra_state *ra = ractl->ra;
    unsigned long max_pages = ra->ra_pages;
    unsigned long add_pages;
    unsigned long index = readahead_index(ractl);
    pgoff_t prev_index;

    /*
     * If the request exceeds the readahead window, allow the read to
     * be up to the optimal hardware IO size
     */
    if (req_size > max_pages && bdi->io_pages > max_pages)
        max_pages = min(req_size, bdi->io_pages);

    /*
     * start of file
     */
    if (!index)
        goto initial_readahead;

    /*
     * It's the expected callback index, assume sequential access.
     * Ramp up sizes, and push forward the readahead window.
     */
    if ((index == (ra->start + ra->size - ra->async_size) ||
         index == (ra->start + ra->size))) {
        ra->start += ra->size;
        ra->size = get_next_ra_size(ra, max_pages);
        ra->async_size = ra->size;
        goto readit;
    }

    /*
     * Hit a marked page without valid readahead state.
     * E.g. interleaved reads.
     * Query the pagecache for async_size, which normally equals to
     * readahead size. Ramp it up and use it as the new readahead size.
     */
    if (hit_readahead_marker) {
        pgoff_t start;

        rcu_read_lock();
        start = page_cache_next_miss(ractl->mapping, index + 1,
                max_pages);
        rcu_read_unlock();

        if (!start || start - index > max_pages)
            return;

        ra->start = start;
        ra->size = start - index;    /* old async_size */
        ra->size += req_size;
        ra->size = get_next_ra_size(ra, max_pages);
        ra->async_size = ra->size;
        goto readit;
    }

    /*
     * oversize read
     */
    if (req_size > max_pages)
        goto initial_readahead;

    /*
     * sequential cache miss
     * trivial case: (index - prev_index) == 1
     * unaligned reads: (index - prev_index) == 0
     */
    prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
    if (index - prev_index <= 1UL)
        goto initial_readahead;

    /*
     * Query the page cache and look for the traces(cached history pages)
     * that a sequential stream would leave behind.
     */
    if (try_context_readahead(ractl->mapping, ra, index, req_size,
            max_pages))
        goto readit;

    /*
     * standalone, small random read
     * Read as is, and do not pollute the readahead state.
     */
    do_page_cache_ra(ractl, req_size, 0);
    return;

initial_readahead:
    ra->start = index;
    ra->size = get_init_ra_size(req_size, max_pages);
    ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
    /*
     * Will this read hit the readahead marker made by itself?
     * If so, trigger the readahead marker hit now, and merge
     * the resulted next readahead window into the current one.
     * Take care of maximum IO pages as above.
     */
    if (index == ra->start && ra->size == ra->async_size) {
        add_pages = get_next_ra_size(ra, max_pages);
        if (ra->size + add_pages <= max_pages) {
            ra->async_size = add_pages;
            ra->size += add_pages;
        } else {
            ra->size = max_pages;
            ra->async_size = max_pages >> 1;
        }
    }

    ractl->_index = ra->start;
    do_page_cache_ra(ractl, ra->size, ra->async_size);
}
// ...
void page_cache_async_ra(struct readahead_control *ractl,
        struct page *page, unsigned long req_count)
{
    /* no read-ahead */
    if (!ractl->ra->ra_pages)
        return;

    /*
     * Same bit is used for PG_readahead and PG_reclaim.
     */
    if (PageWriteback(page))
        return;

    ClearPageReadahead(page);

    /*
     * Defer asynchronous read-ahead on IO congestion.
     */
    if (inode_read_congested(ractl->mapping->host))
        return;

    if (blk_cgroup_congested())
        return;

    /* do read-ahead */
    ondemand_readahead(ractl, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
// ... 
static inline
void page_cache_async_readahead(struct address_space *mapping,
        struct file_ra_state *ra, struct file *file,
        struct page *page, pgoff_t index, unsigned long req_count)
{
    DEFINE_READAHEAD(ractl, file, ra, mapping, index);
    page_cache_async_ra(&ractl, page, req_count);
}
// ...
static int filemap_readahead(struct kiocb *iocb, struct file *file,
        struct address_space *mapping, struct page *page,
        pgoff_t last_index)
{
    if (iocb->ki_flags & IOCB_NOIO)
        return -EAGAIN;
    page_cache_async_readahead(mapping, &file->f_ra, file, page,
            page->index, last_index - page->index);
    return 0;
}

do_page_cache_ra

Both paths eventually call do_page_cache_ra, which performs the actual read from disk:

static void read_pages(struct readahead_control *rac, struct list_head *pages,
        bool skip_page)
{
    const struct address_space_operations *aops = rac->mapping->a_ops;
    struct page *page;
    struct blk_plug plug;

    if (!readahead_count(rac))
        goto out;

    blk_start_plug(&plug);

    if (aops->readahead) {
        aops->readahead(rac);
        /* Clean up the remaining pages */
        while ((page = readahead_page(rac))) {
            unlock_page(page);
            put_page(page);
        }
    } else if (aops->readpages) {
        aops->readpages(rac->file, rac->mapping, pages,
                readahead_count(rac));
        /* Clean up the remaining pages */
        put_pages_list(pages);
        rac->_index += rac->_nr_pages;
        rac->_nr_pages = 0;
    } else {
        while ((page = readahead_page(rac))) {
            aops->readpage(rac->file, page);
            put_page(page);
        }
    }

    blk_finish_plug(&plug);

    BUG_ON(!list_empty(pages));
    BUG_ON(readahead_count(rac));

out:
    if (skip_page)
        rac->_index++;
}
// ...
/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
        unsigned long nr_to_read, unsigned long lookahead_size)
{
    struct address_space *mapping = ractl->mapping;
    unsigned long index = readahead_index(ractl);
    LIST_HEAD(page_pool);
    gfp_t gfp_mask = readahead_gfp_mask(mapping);
    unsigned long i;

    /*
     * Partway through the readahead operation, we will have added
     * locked pages to the page cache, but will not yet have submitted
     * them for I/O.  Adding another page may need to allocate memory,
     * which can trigger memory reclaim.  Telling the VM we're in
     * the middle of a filesystem operation will cause it to not
     * touch file-backed pages, preventing a deadlock.  Most (all?)
     * filesystems already specify __GFP_NOFS in their mapping's
     * gfp_mask, but let's be explicit here.
     */
    unsigned int nofs = memalloc_nofs_save();

    filemap_invalidate_lock_shared(mapping);
    /*
     * Preallocate as many pages as we will need.
     */
    for (i = 0; i < nr_to_read; i++) {
        struct page *page = xa_load(&mapping->i_pages, index + i);

        if (page && !xa_is_value(page)) {
            /*
             * Page already present?  Kick off the current batch
             * of contiguous pages before continuing with the
             * next batch.  This page may be the one we would
             * have intended to mark as Readahead, but we don't
             * have a stable reference to this page, and it's
             * not worth getting one just for that.
             */
            read_pages(ractl, &page_pool, true);
            i = ractl->_index + ractl->_nr_pages - index - 1;
            continue;
        }

        page = __page_cache_alloc(gfp_mask);
        if (!page)
            break;
        if (mapping->a_ops->readpages) {
            page->index = index + i;
            list_add(&page->lru, &page_pool);
        } else if (add_to_page_cache_lru(page, mapping, index + i,
                    gfp_mask) < 0) {
            put_page(page);
            read_pages(ractl, &page_pool, true);
            i = ractl->_index + ractl->_nr_pages - index - 1;
            continue;
        }
        if (i == nr_to_read - lookahead_size)
            SetPageReadahead(page);
        ractl->_nr_pages++;
    }

    /*
     * Now start the IO.  We ignore I/O errors - if the page is not
     * uptodate then the caller will launch readpage again, and
     * will then handle the error.
     */
    read_pages(ractl, &page_pool, false);
    filemap_invalidate_unlock_shared(mapping);
    memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void do_page_cache_ra(struct readahead_control *ractl,
        unsigned long nr_to_read, unsigned long lookahead_size)
{
    struct inode *inode = ractl->mapping->host;
    unsigned long index = readahead_index(ractl);
    loff_t isize = i_size_read(inode);
    pgoff_t end_index;  /* The last page we want to read */

    if (isize == 0)
        return;

    end_index = (isize - 1) >> PAGE_SHIFT;
    if (index > end_index)
        return;
    /* Don't read past the page containing the last byte of the file */
    if (nr_to_read > end_index - index)
        nr_to_read = end_index - index + 1;

    page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

In read_pages, if the mapping implements ->readahead it is called, and any pages remaining in the readahead_control are simply unlocked and released. Otherwise the legacy ->readpages is tried, and as a last resort each page is read one by one through ->readpage; for a raw block device that method is blkdev_readpage. Some of these interfaces are no longer in common use, and although this all started out as readahead, in the end it boils down to reading pages.

static int blkdev_readpage(struct file * file, struct page * page)
{
    return block_read_full_page(page, blkdev_get_block);
}

fs/buffer.c中,从块设备中读取,这是一个通用的方法,文件系统需要提供get_block即可,mpage中的方法注释说明已经不常用了:

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
    struct inode *inode = page->mapping->host;
    sector_t iblock, lblock;
    struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
    unsigned int blocksize, bbits;
    int nr, i;
    int fully_mapped = 1;

    head = create_page_buffers(page, inode, 0);
    blocksize = head->b_size;
    bbits = block_size_bits(blocksize);

    iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
    lblock = (i_size_read(inode)+blocksize-1) >> bbits;
    bh = head;
    nr = 0;
    i = 0;

    do {
        if (buffer_uptodate(bh))
            continue;

        if (!buffer_mapped(bh)) {
            int err = 0;

            fully_mapped = 0;
            if (iblock < lblock) {
                WARN_ON(bh->b_size != blocksize);
                err = get_block(inode, iblock, bh, 0);
                if (err)
                    SetPageError(page);
            }
            if (!buffer_mapped(bh)) {
                zero_user(page, i * blocksize, blocksize);
                if (!err)
                    set_buffer_uptodate(bh);
                continue;
            }
            /*
             * get_block() might have updated the buffer
             * synchronously
             */
            if (buffer_uptodate(bh))
                continue;
        }
        arr[nr++] = bh;
    } while (i++, iblock++, (bh = bh->b_this_page) != head);

    if (fully_mapped)
        SetPageMappedToDisk(page);

    if (!nr) {
        /*
         * All buffers are uptodate - we can set the page uptodate
         * as well. But not if get_block() returned an error.
         */
        if (!PageError(page))
            SetPageUptodate(page);
        unlock_page(page);
        return 0;
    }

    /* Stage two: lock the buffers */
    for (i = 0; i < nr; i++) {
        bh = arr[i];
        lock_buffer(bh);
        mark_buffer_async_read(bh);
    }

    /*
     * Stage 3: start the IO.  Check for uptodateness
     * inside the buffer lock in case another process reading
     * the underlying blockdev brought it uptodate (the sct fix).
     */
    for (i = 0; i < nr; i++) {
        bh = arr[i];
        if (buffer_uptodate(bh))
            end_buffer_async_read(bh, 1);
        else
            submit_bh(REQ_OP_READ, 0, bh);
    }
    return 0;
}
EXPORT_SYMBOL(block_read_full_page);

The job of this generic routine is to make sure every buffer in the page ends up uptodate.

So it first collects the buffers that are not yet uptodate, then locks them and marks them with mark_buffer_async_read() so that end_buffer_async_read() runs as the completion callback, and finally submits the I/O.

static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
    unsigned long flags;
    struct buffer_head *first;
    struct buffer_head *tmp;
    struct page *page;
    int page_uptodate = 1;

    BUG_ON(!buffer_async_read(bh));

    page = bh->b_page;
    if (uptodate) {
        set_buffer_uptodate(bh);
    } else {
        clear_buffer_uptodate(bh);
        buffer_io_error(bh, ", async page read");
        SetPageError(page);
    }

    /*
     * Be _very_ careful from here on. Bad things can happen if
     * two buffer heads end IO at almost the same time and both
     * decide that the page is now completely done.
     */
    first = page_buffers(page);
    spin_lock_irqsave(&first->b_uptodate_lock, flags);
    clear_buffer_async_read(bh);
    unlock_buffer(bh);
    tmp = bh;
    do {
        if (!buffer_uptodate(tmp))
            page_uptodate = 0;
        if (buffer_async_read(tmp)) {
            BUG_ON(!buffer_locked(tmp));
            goto still_busy;
        }
        tmp = tmp->b_this_page;
    } while (tmp != bh);
    spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

    /*
     * If none of the buffers had errors and they are all
     * uptodate then we can set the page uptodate.
     */
    if (page_uptodate && !PageError(page))
        SetPageUptodate(page);
    unlock_page(page);
    return;

still_busy:
    spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
    return;
}

file->f_op->read(file, buf, count, pos)

Only a handful of filesystems, such as cifs, fuse, ceph and 9p, instantiate this plain ->read method, so most reads go through the ->read_iter path described above.

The write() system call

read类似

The lseek() system call

This system call is declared in unistd.h; it takes a file descriptor, an offset, and a whence value saying what the offset is relative to.
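
A hedged userspace sketch of the three classic uses, including the common trick of obtaining the file size with SEEK_END:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc < 2) { fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }

    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    off_t cur  = lseek(fd, 0, SEEK_CUR);   /* position query: just returns f_pos */
    off_t size = lseek(fd, 0, SEEK_END);   /* eof is added to the offset */
    off_t back = lseek(fd, 0, SEEK_SET);   /* absolute seek back to the start */

    printf("cur=%lld size=%lld back=%lld\n",
           (long long)cur, (long long)size, (long long)back);
    close(fd);
    return 0;
}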

The kernel side looks like this:

static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
    off_t retval;
    struct fd f = fdget_pos(fd);
    if (!f.file)
        return -EBADF;

    retval = -EINVAL;
    if (whence <= SEEK_MAX) {
        loff_t res = vfs_llseek(f.file, offset, whence);
        retval = res;
        if (res != (loff_t)retval)
            retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
    }
    fdput_pos(f);
    return retval;
}

SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
    return ksys_lseek(fd, offset, whence);
}

As before, the file is first looked up from the descriptor, whence is checked against SEEK_MAX, and the work is then done by the vfs_llseek helper.

loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
    loff_t (*fn)(struct file *, loff_t, int);

    fn = no_llseek;
    if (file->f_mode & FMODE_LSEEK) {
        if (file->f_op->llseek)
            fn = file->f_op->llseek;
    }
    return fn(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);

Here loff_t is really a 64-bit signed integer (long long).

vfs_llseek calls the ->llseek method; minix's routine is the generic generic_file_llseek:

/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:   file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size:   max size of this file in file system
 * @eof:    offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
        loff_t maxsize, loff_t eof)
{
    switch (whence) {
    case SEEK_END:
        offset += eof;
        break;
    case SEEK_CUR:
        /*
         * Here we special-case the lseek(fd, 0, SEEK_CUR)
         * position-querying operation.  Avoid rewriting the "same"
         * f_pos value back to the file because a concurrent read(),
         * write() or lseek() might have altered it
         */
        if (offset == 0)
            return file->f_pos;
        /*
         * f_lock protects against read/modify/write race with other
         * SEEK_CURs. Note that parallel writes and reads behave
         * like SEEK_SET.
         */
        spin_lock(&file->f_lock);
        offset = vfs_setpos(file, file->f_pos + offset, maxsize);
        spin_unlock(&file->f_lock);
        return offset;
    case SEEK_DATA:
        /*
         * In the generic case the entire file is data, so as long as
         * offset isn't at the end of the file then the offset is data.
         */
        if ((unsigned long long)offset >= eof)
            return -ENXIO;
        break;
    case SEEK_HOLE:
        /*
         * There is a virtual hole at the end of the file, so as long as
         * offset isn't i_size or larger, return i_size.
         */
        if ((unsigned long long)offset >= eof)
            return -ENXIO;
        offset = eof;
        break;
    }

    return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file:   file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is a generic implemenation of ->llseek useable for all normal local
 * filesystems.  It just updates the file offset to the value specified by
 * @offset and @whence.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
    struct inode *inode = file->f_mapping->host;

    return generic_file_llseek_size(file, offset, whence,
                    inode->i_sb->s_maxbytes,
                    i_size_read(inode));
}
EXPORT_SYMBOL(generic_file_llseek);

generic_file_llseek in turn calls the helper generic_file_llseek_size, which handles each whence case (special-casing the position query lseek(fd, 0, SEEK_CUR)), checks the boundaries, and finally calls vfs_setpos:

/**
 * vfs_setpos - update the file offset for lseek
 * @file:   file structure in question
 * @offset: file offset to seek to
 * @maxsize:    maximum file size
 *
 * This is a low-level filesystem helper for updating the file offset to
 * the value specified by @offset if the given offset is valid and it is
 * not equal to the current file offset.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
    if (offset < 0 && !unsigned_offsets(file))
        return -EINVAL;
    if (offset > maxsize)
        return -EINVAL;

    if (offset != file->f_pos) {
        file->f_pos = offset;
        file->f_version = 0;
    }
    return offset;
}
EXPORT_SYMBOL(vfs_setpos);
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.