1. The mmap_pgoff system call is implemented as follows:
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
        unsigned long, prot, unsigned long, flags,
        unsigned long, fd, unsigned long, pgoff)
{
    return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}
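For orientation, this is the kernel-side entry that a plain userspace mmap() call eventually reaches (the exact syscall entry differs by architecture, e.g. mmap vs. mmap2/mmap_pgoff, and the C library converts the byte offset into the page offset pgoff, but all of these funnel into ksys_mmap_pgoff). A minimal userspace sketch, not part of the kernel source:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    /* One page, private and anonymous: no fd, offset 0. */
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    strcpy(p, "hello from an anonymous mapping");
    printf("%s at %p\n", p, (void *)p);
    munmap(p, 4096);
    return 0;
}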
2. The ksys_mmap_pgoff function
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                  unsigned long prot, unsigned long flags,
                  unsigned long fd, unsigned long pgoff)
{
    struct file *file = NULL;
    unsigned long retval;

    if (!(flags & MAP_ANONYMOUS)) {
        audit_mmap_fd(fd, flags);
        file = fget(fd);
        if (!file)
            return -EBADF;
        if (is_file_hugepages(file)) {
            len = ALIGN(len, huge_page_size(hstate_file(file)));
        } else if (unlikely(flags & MAP_HUGETLB)) {
            retval = -EINVAL;
            goto out_fput;
        }
    } else if (flags & MAP_HUGETLB) {
        struct hstate *hs;

        hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
        if (!hs)
            return -EINVAL;

        len = ALIGN(len, huge_page_size(hs));
        /*
         * VM_NORESERVE is used because the reservations will be
         * taken when vm_ops->mmap() is called
         */
        file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                VM_NORESERVE,
                HUGETLB_ANONHUGE_INODE,
                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
        if (IS_ERR(file))
            return PTR_ERR(file);
    }

    retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
    if (file)
        fput(file);
    return retval;
}
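ksys_mmap_pgoff resolves the file descriptor for file mappings and handles the hugetlb cases: for an anonymous MAP_HUGETLB request it rounds len up to the huge page size and backs the mapping with a file created by hugetlb_file_setup(). A userspace sketch of that branch, assuming the system has huge pages reserved (e.g. vm.nr_hugepages > 0) and a 2 MiB default huge page size:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 2 * 1024 * 1024;   /* assumed default huge page size */

    /*
     * Anonymous MAP_HUGETLB takes the "else if (flags & MAP_HUGETLB)"
     * branch above: len is aligned to the huge page size and the mapping
     * is backed by a hugetlbfs file from hugetlb_file_setup(). This
     * fails (typically ENOMEM) if no huge pages are reserved.
     */
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap(MAP_HUGETLB)");
        return 1;
    }
    printf("huge page mapping at %p\n", p);
    munmap(p, len);
    return 0;
}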
3. The vm_mmap_pgoff function
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
    unsigned long len, unsigned long prot,
    unsigned long flag, unsigned long pgoff)
{
    unsigned long ret;
    struct mm_struct *mm = current->mm;
    unsigned long populate;
    LIST_HEAD(uf);

    ret = security_mmap_file(file, prot, flag);
    if (!ret) {
        if (mmap_write_lock_killable(mm))
            return -EINTR;
        ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
                      &uf);
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
            mm_populate(ret, populate);
    }
    return ret;
}
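vm_mmap_pgoff takes mmap_lock for writing around do_mmap() and, if do_mmap() reported a populate length (MAP_POPULATE or MAP_LOCKED), faults the whole range in with mm_populate() after dropping the lock. A small userspace sketch of the MAP_POPULATE case:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 16 * 4096;

    /*
     * With MAP_POPULATE, do_mmap() sets *populate = len, and
     * vm_mmap_pgoff() then calls mm_populate() so the range is
     * faulted in before mmap() returns to userspace.
     */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    p[0] = 1;   /* should not take a page fault: already populated */
    munmap(p, len);
    return 0;
}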
4. The do_mmap function
/*
 * The caller must write-lock current->mm->mmap_lock.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
            unsigned long len, unsigned long prot,
            unsigned long flags, vm_flags_t vm_flags,
            unsigned long pgoff, unsigned long *populate,
            struct list_head *uf)
{
    struct mm_struct *mm = current->mm;
    int pkey = 0;

    *populate = 0;

    if (!len)
        return -EINVAL;

    /*
     * Does the application expect PROT_READ to imply PROT_EXEC?
     *
     * (the exception is when the underlying filesystem is noexec
     *  mounted, in which case we dont add PROT_EXEC.)
     */
    if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
        if (!(file && path_noexec(&file->f_path)))
            prot |= PROT_EXEC;

    /* force arch specific MAP_FIXED handling in get_unmapped_area */
    if (flags & MAP_FIXED_NOREPLACE)
        flags |= MAP_FIXED;

    if (!(flags & MAP_FIXED))
        addr = round_hint_to_min(addr);

    /* Careful about overflows.. */
    len = PAGE_ALIGN(len);
    if (!len)
        return -ENOMEM;

    /* offset overflow? */
    if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
        return -EOVERFLOW;

    /* Too many mappings? */
    if (mm->map_count > sysctl_max_map_count)
        return -ENOMEM;

    /* Obtain the address to map to. we verify (or select) it and ensure
     * that it represents a valid section of the address space.
     */
    addr = get_unmapped_area(file, addr, len, pgoff, flags);
    if (IS_ERR_VALUE(addr))
        return addr;

    if (flags & MAP_FIXED_NOREPLACE) {
        if (find_vma_intersection(mm, addr, addr + len))
            return -EEXIST;
    }

    if (prot == PROT_EXEC) {
        pkey = execute_only_pkey(mm);
        if (pkey < 0)
            pkey = 0;
    }

    /* Do simple checking here so the lower-level routines won't have
     * to. we assume access permissions have been handled by the open
     * of the memory object, so we don't do any here.
     */
    vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
            mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

    if (flags & MAP_LOCKED)
        if (!can_do_mlock())
            return -EPERM;

    if (!mlock_future_ok(mm, vm_flags, len))
        return -EAGAIN;

    if (file) {
        struct inode *inode = file_inode(file);
        unsigned long flags_mask;

        if (!file_mmap_ok(file, inode, pgoff, len))
            return -EOVERFLOW;

        flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;

        switch (flags & MAP_TYPE) {
        case MAP_SHARED:
            /*
             * Force use of MAP_SHARED_VALIDATE with non-legacy
             * flags. E.g. MAP_SYNC is dangerous to use with
             * MAP_SHARED as you don't know which consistency model
             * you will get. We silently ignore unsupported flags
             * with MAP_SHARED to preserve backward compatibility.
             */
            flags &= LEGACY_MAP_MASK;
            fallthrough;
        case MAP_SHARED_VALIDATE:
            if (flags & ~flags_mask)
                return -EOPNOTSUPP;
            if (prot & PROT_WRITE) {
                if (!(file->f_mode & FMODE_WRITE))
                    return -EACCES;
                if (IS_SWAPFILE(file->f_mapping->host))
                    return -ETXTBSY;
            }

            /*
             * Make sure we don't allow writing to an append-only
             * file..
             */
            if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                return -EACCES;

            vm_flags |= VM_SHARED | VM_MAYSHARE;
            if (!(file->f_mode & FMODE_WRITE))
                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
            fallthrough;
        case MAP_PRIVATE:
            if (!(file->f_mode & FMODE_READ))
                return -EACCES;
            if (path_noexec(&file->f_path)) {
                if (vm_flags & VM_EXEC)
                    return -EPERM;
                vm_flags &= ~VM_MAYEXEC;
            }

            if (!file->f_op->mmap)
                return -ENODEV;
            if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                return -EINVAL;
            break;

        default:
            return -EINVAL;
        }
    } else {
        switch (flags & MAP_TYPE) {
        case MAP_SHARED:
            if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                return -EINVAL;
            /*
             * Ignore pgoff.
             */
            pgoff = 0;
            vm_flags |= VM_SHARED | VM_MAYSHARE;
            break;
        case MAP_PRIVATE:
            /*
             * Set pgoff according to addr for anon_vma.
             */
            pgoff = addr >> PAGE_SHIFT;
            break;
        default:
            return -EINVAL;
        }
    }

    /*
     * Set 'VM_NORESERVE' if we should not account for the
     * memory use of this mapping.
     */
    if (flags & MAP_NORESERVE) {
        /* We honor MAP_NORESERVE if allowed to overcommit */
        if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
            vm_flags |= VM_NORESERVE;

        /* hugetlb applies strict overcommit unless MAP_NORESERVE */
        if (file && is_file_hugepages(file))
            vm_flags |= VM_NORESERVE;
    }

    addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
    if (!IS_ERR_VALUE(addr) &&
        ((vm_flags & VM_LOCKED) ||
         (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
        *populate = len;
    return addr;
}
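One observable consequence of the MAP_FIXED_NOREPLACE handling above (flags |= MAP_FIXED followed by the find_vma_intersection() check) is that a request for an already-mapped range fails with EEXIST rather than clobbering the existing mapping. A userspace sketch:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;

    char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (a == MAP_FAILED)
        return 1;

    /*
     * Request the same range again with MAP_FIXED_NOREPLACE.
     * do_mmap() forces MAP_FIXED handling for get_unmapped_area(), then
     * find_vma_intersection() sees the existing VMA and returns -EEXIST
     * instead of silently replacing it (which plain MAP_FIXED would do).
     */
    char *b = mmap(a, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
    if (b == MAP_FAILED && errno == EEXIST)
        printf("range at %p already mapped: EEXIST as expected\n", (void *)a);

    munmap(a, len);
    return 0;
}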
5. The get_unmapped_area function
do_mmap uses get_unmapped_area to obtain the address addr to map at:
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
    unsigned long (*get_area)(struct file *, unsigned long,
                  unsigned long, unsigned long, unsigned long);

    unsigned long error = arch_mmap_check(addr, len, flags);
    if (error)
        return error;

    /* Careful about overflows.. */
    if (len > TASK_SIZE)
        return -ENOMEM;

    get_area = current->mm->get_unmapped_area;
    if (file) {
        if (file->f_op->get_unmapped_area)
            get_area = file->f_op->get_unmapped_area;
    } else if (flags & MAP_SHARED) {
        /*
         * mmap_region() will call shmem_zero_setup() to create a file,
         * so use shmem's get_unmapped_area in case it can be huge.
         * do_mmap() will clear pgoff, so match alignment.
         */
        pgoff = 0;
        get_area = shmem_get_unmapped_area;
    }

    addr = get_area(file, addr, len, pgoff, flags);
    if (IS_ERR_VALUE(addr))
        return addr;

    if (addr > TASK_SIZE - len)
        return -ENOMEM;
    if (offset_in_page(addr))
        return -EINVAL;

    error = security_mmap_addr(addr);
    return error ? error : addr;
}
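When MAP_FIXED is not requested, the addr passed down here is only a hint: get_unmapped_area() asks mm->get_unmapped_area (or the file's ->get_unmapped_area) for a suitable free range and may return a different address. A userspace sketch; the hint value is arbitrary:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;
    void *hint = (void *)0x100000000000UL;  /* arbitrary hint address */

    /*
     * Without MAP_FIXED the hint is advisory only: the kernel may place
     * the mapping elsewhere if the hinted range is unsuitable or in use.
     */
    void *p = mmap(hint, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("hint %p, kernel chose %p\n", hint, p);
    munmap(p, len);
    return 0;
}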
6. The mmap_region function
The addr chosen by get_unmapped_area is then passed to mmap_region:
unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
        struct list_head *uf)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma = NULL;
    struct vm_area_struct *next, *prev, *merge;
    pgoff_t pglen = len >> PAGE_SHIFT;
    unsigned long charged = 0;
    unsigned long end = addr + len;
    unsigned long merge_start = addr, merge_end = end;
    pgoff_t vm_pgoff;
    int error;
    VMA_ITERATOR(vmi, mm, addr);

    /* Check against address space limit. */
    if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
        unsigned long nr_pages;

        /*
         * MAP_FIXED may remove pages of mappings that intersects with
         * requested mapping. Account for the pages it would unmap.
         */
        nr_pages = count_vma_pages_range(mm, addr, end);

        if (!may_expand_vm(mm, vm_flags,
                    (len >> PAGE_SHIFT) - nr_pages))
            return -ENOMEM;
    }

    /* Unmap any existing mapping in the area */
    if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
        return -ENOMEM;

    /*
     * Private writable mapping: check memory availability
     */
    if (accountable_mapping(file, vm_flags)) {
        charged = len >> PAGE_SHIFT;
        if (security_vm_enough_memory_mm(mm, charged))
            return -ENOMEM;
        vm_flags |= VM_ACCOUNT;
    }

    next = vma_next(&vmi);
    prev = vma_prev(&vmi);
    if (vm_flags & VM_SPECIAL) {
        if (prev)
            vma_iter_next_range(&vmi);
        goto cannot_expand;
    }

    /* Attempt to expand an old mapping */
    /* Check next */
    if (next && next->vm_start == end && !vma_policy(next) &&
        can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
                 NULL_VM_UFFD_CTX, NULL)) {
        merge_end = next->vm_end;
        vma = next;
        vm_pgoff = next->vm_pgoff - pglen;
    }

    /* Check prev */
    if (prev && prev->vm_end == addr && !vma_policy(prev) &&
        (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
                       pgoff, vma->vm_userfaultfd_ctx, NULL) :
           can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
                       NULL_VM_UFFD_CTX, NULL))) {
        merge_start = prev->vm_start;
        vma = prev;
        vm_pgoff = prev->vm_pgoff;
    } else if (prev) {
        vma_iter_next_range(&vmi);
    }

    /* Actually expand, if possible */
    if (vma &&
        !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
        khugepaged_enter_vma(vma, vm_flags);
        goto expanded;
    }

    if (vma == prev)
        vma_iter_set(&vmi, addr);
cannot_expand:

    /*
     * Determine the object being mapped and call the appropriate
     * specific mapper. the address has already been validated, but
     * not unmapped, but the maps are removed from the list.
     */
    vma = vm_area_alloc(mm);
    if (!vma) {
        error = -ENOMEM;
        goto unacct_error;
    }

    vma_iter_config(&vmi, addr, end);
    vma->vm_start = addr;
    vma->vm_end = end;
    vm_flags_init(vma, vm_flags);
    vma->vm_page_prot = vm_get_page_prot(vm_flags);
    vma->vm_pgoff = pgoff;

    if (file) {
        if (vm_flags & VM_SHARED) {
            error = mapping_map_writable(file->f_mapping);
            if (error)
                goto free_vma;
        }

        vma->vm_file = get_file(file);
        error = call_mmap(file, vma);
        if (error)
            goto unmap_and_free_vma;

        /*
         * Expansion is handled above, merging is handled below.
         * Drivers should not alter the address of the VMA.
         */
        error = -EINVAL;
        if (WARN_ON((addr != vma->vm_start)))
            goto close_and_free_vma;

        vma_iter_config(&vmi, addr, end);
        /*
         * If vm_flags changed after call_mmap(), we should try merge
         * vma again as we may succeed this time.
         */
        if (unlikely(vm_flags != vma->vm_flags && prev)) {
            merge = vma_merge(&vmi, mm, prev, vma->vm_start,
                    vma->vm_end, vma->vm_flags, NULL,
                    vma->vm_file, vma->vm_pgoff, NULL,
                    NULL_VM_UFFD_CTX, NULL);
            if (merge) {
                /*
                 * ->mmap() can change vma->vm_file and fput
                 * the original file. So fput the vma->vm_file
                 * here or we would add an extra fput for file
                 * and cause general protection fault
                 * ultimately.
                 */
                fput(vma->vm_file);
                vm_area_free(vma);
                vma = merge;
                /* Update vm_flags to pick up the change. */
                vm_flags = vma->vm_flags;
                goto unmap_writable;
            }
        }

        vm_flags = vma->vm_flags;
    } else if (vm_flags & VM_SHARED) {
        error = shmem_zero_setup(vma);
        if (error)
            goto free_vma;
    } else {
        vma_set_anonymous(vma);
    }

    if (map_deny_write_exec(vma, vma->vm_flags)) {
        error = -EACCES;
        goto close_and_free_vma;
    }

    /* Allow architectures to sanity-check the vm_flags */
    error = -EINVAL;
    if (!arch_validate_flags(vma->vm_flags))
        goto close_and_free_vma;

    error = -ENOMEM;
    if (vma_iter_prealloc(&vmi, vma))
        goto close_and_free_vma;

    /* Lock the VMA since it is modified after insertion into VMA tree */
    vma_start_write(vma);
    vma_iter_store(&vmi, vma);
    mm->map_count++;
    if (vma->vm_file) {
        i_mmap_lock_write(vma->vm_file->f_mapping);
        if (vma->vm_flags & VM_SHARED)
            mapping_allow_writable(vma->vm_file->f_mapping);

        flush_dcache_mmap_lock(vma->vm_file->f_mapping);
        vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
        flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
        i_mmap_unlock_write(vma->vm_file->f_mapping);
    }

    /*
     * vma_merge() calls khugepaged_enter_vma() either, the below
     * call covers the non-merge case.
     */
    khugepaged_enter_vma(vma, vma->vm_flags);

    /* Once vma denies write, undo our temporary denial count */
unmap_writable:
    if (file && vm_flags & VM_SHARED)
        mapping_unmap_writable(file->f_mapping);
    file = vma->vm_file;
    ksm_add_vma(vma);
expanded:
    perf_event_mmap(vma);

    vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
    if (vm_flags & VM_LOCKED) {
        if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                    is_vm_hugetlb_page(vma) ||
                    vma == get_gate_vma(current->mm))
            vm_flags_clear(vma, VM_LOCKED_MASK);
        else
            mm->locked_vm += (len >> PAGE_SHIFT);
    }

    if (file)
        uprobe_mmap(vma);

    /*
     * New (or expanded) vma always get soft dirty status.
     * Otherwise user-space soft-dirty page tracker won't
     * be able to distinguish situation when vma area unmapped,
     * then new mapped in-place (which must be aimed as
     * a completely new data area).
     */
    vm_flags_set(vma, VM_SOFTDIRTY);

    vma_set_page_prot(vma);

    validate_mm(mm);
    return addr;

close_and_free_vma:
    if (file && vma->vm_ops && vma->vm_ops->close)
        vma->vm_ops->close(vma);

    if (file || vma->vm_file) {
unmap_and_free_vma:
        fput(vma->vm_file);
        vma->vm_file = NULL;

        vma_iter_set(&vmi, vma->vm_end);
        /* Undo any partial mapping done by a device driver. */
        unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
                 vma->vm_end, vma->vm_end, true);
    }
    if (file && (vm_flags & VM_SHARED))
        mapping_unmap_writable(file->f_mapping);
free_vma:
    vm_area_free(vma);
unacct_error:
    if (charged)
        vm_unacct_memory(charged);
    validate_mm(mm);
    return error;
}
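A side effect of the merge logic at the top of mmap_region() that is visible from userspace: mapping a range immediately adjacent to an existing anonymous mapping with identical protections usually expands the existing VMA instead of creating a new one, so /proc/<pid>/maps shows a single region. A sketch, assuming the page right after the first mapping is still free:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;

    /* First anonymous private mapping. */
    char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (a == MAP_FAILED)
        return 1;

    /*
     * Map the page directly after it with the same protections.
     * In mmap_region() this hits the "Check prev" path: the new range
     * starts where the previous VMA ends and the flags are compatible,
     * so the existing VMA is expanded rather than a second one inserted.
     */
    char *b = mmap(a + len, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
    if (b == MAP_FAILED)
        return 1;

    printf("a=%p b=%p, check /proc/%d/maps for one merged region\n",
           (void *)a, (void *)b, getpid());
    getchar();          /* pause so the maps file can be inspected */
    munmap(a, 2 * len);
    return 0;
}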