信号 2017-02-26

1. 发送

以tkill为例来说明. linux-3.10.86/kernel/signal.c 假定不是发送给组的, 即__send_signal()的@group为0

do_tkill -> do_send_specific
|--find_task_by_vpid
|--check_kill_permission
|--do_send_sig_info -> send_signal
|   |--__send_signal


__send_signal
|--q=__sigqueue_alloc //new sigqueue instance
|--list_add_tail(&q->list, &pending->list);
|--设置q->info的各域
|   |--q->info.si_signo =
|   |--q->info.si_pid =
|--complete_signal
|   |--signal_wake_up -> signal_wake_up_state
|   |   |--set_tsk_thread_flag(t, TIF_SIGPENDING);
|   |   |--wake_up_state(t, state | TASK_INTERRUPTIBLE) -> try_to_wake_up

fixup_exception 2017-02-25

1. 什么情况下会调用fixup_exception

linux-3.10.86/arch/arm/mm/fault.c

/*
 * Abridged excerpt of the ARM page-fault handler; most of the original body
 * is omitted here. Only the branch that leads to __do_kernel_fault() is kept.
 */
static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{

    /* The fault was not taken from user mode: jump to the no_context path. */
    if (!user_mode(regs))
        goto no_context;


no_context:
    /* Hand the fault over to the kernel-fault handler. */
    __do_kernel_fault(mm, addr, fsr, regs);
    return 0;
}

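所以, 用户态(user_mode(regs))发起的 读或写 , copy to/from user等不会调用到 __do_kernel_fault, 也就不会调用fixup_exception的. (fixup_exception 仅会被 __do_kernel_fault调用.) (注: copy_to/from_user 本身运行在内核态, 其触发缺页时 user_mode(regs) 为假, 上述结论对它是否成立, 需对照完整的 do_page_fault 源码确认.)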

do_fork和COW 2017-02-24

1. 问题引入

摘自APUE:

由于在fork之后经常跟随着exec, 所以现在的很多实现并不执行一个父进程数据段, 栈和堆的完全复制. 作为替代, 使用了COW技术. 这些区域由父子进程共享, 而且内核将它们的访问权限改变为只读的. 如果父子进程中的任一个试图修改这些区域, 则内核只为修改区域的那块内存制作一个副本, 通常是虚拟存储系统中的一"页".

问题:哪里设置只读, 如何知道只读?

2. 解

快速tips:
PTE entry is marked as un-writeable.
But VMA is marked as writeable.

copy_one_pte() 会调用 ptep_set_wrprotect()

还是先从dup_mm()开始看吧.
linux-3.10.86/kernel/fork.c

mmap和fault handler 2017-02-24

1. 问题引入

我们想知道, 通过mmap映射文件, 然后读取, 是如何读取到文件的, 其中的fault handler相关的内容.

2. ftrace

#include <stdlib.h>
#include <stdio.h>
#include <strings.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

/*
 * Size in bytes of the region mapped from MPFILE. The expansion is
 * parenthesized so the macro stays correct inside larger expressions
 * such as `x / MEMSIZE` or `MEMSIZE % y`.
 */
#define MEMSIZE (1024*4)
/* Path of the file that is mmap'ed and read by main(). */
#define MPFILE "./.ash_history"

//#define ON_OFF //tracing_on

/*
 * Write `val` into the ftrace control file /sys/kernel/debug/tracing/<path>
 * and, for every file except "trace", read the file back and print it.
 *
 * path: file name relative to the tracing directory (e.g. "tracing_on").
 * val:  NUL-terminated string to write.
 *
 * The file is opened with O_RDWR|O_TRUNC. If the full path does not fit,
 * or open()/write() fails, a message is printed and the process terminates
 * with exit(1). The read-back is best-effort: a failed read is reported but
 * does not abort.
 */
void write_ftrace(char *path, char *val)
{
    char full_path[128];
    char val_read[20];
    ssize_t got;
    int fd_trace;
    int n;

    /* Bounded formatting: a long `path` must not overflow the buffer. */
    n = snprintf(full_path, sizeof(full_path), "%s%s",
                 "/sys/kernel/debug/tracing/", path);
    if (n < 0 || (size_t)n >= sizeof(full_path)) {
        printf("open %s:%s\n", path, "path too long");
        exit(1);
    }

    fd_trace = open(full_path, O_RDWR|O_TRUNC);
    if (fd_trace < 0) {
        printf("open %s:%s\n", path, strerror(errno));
        exit(1);
    }

    printf("to write:%s %s\n", path, val);

    if (write(fd_trace, val, strlen(val)) < 0) {
        printf("write %s %s\n", path, strerror(errno));
        exit(1);
    }

    /* The "trace" file is written only; its content is not echoed back. */
    if (strcmp(path, "trace") == 0) {
        close(fd_trace);
        return;
    }

    /* Leave one byte for the terminator so printing with %s is safe. */
    got = pread(fd_trace, val_read, sizeof(val_read) - 1, 0);
    if (got < 0) {
        printf("read %s:%s\n", path, strerror(errno));
        got = 0;
    }
    val_read[got] = '\0';
    printf("read val:%s\n", val_read);
    close(fd_trace);
}

/*
 * Write `val` at the start of the file `path`, then read the file back from
 * offset 0 and print what was read.
 *
 * path: file to open (O_RDWR, no truncation, no creation).
 * val:  NUL-terminated string to write.
 *
 * If open() or write() fails, a message is printed and the process
 * terminates with exit(1). The read-back is best-effort: a failed read is
 * reported but does not abort.
 */
void write_val(char *path, char *val)
{
    char val_read[20];
    ssize_t got;
    int fd;

    fd = open(path, O_RDWR);
    if (fd < 0) {
        printf("open %s:%s\n", path, strerror(errno));
        exit(1);
    }

    printf("to write:%s %s\n", path, val);
    if (write(fd, val, strlen(val)) < 0) {
        printf("write %s %s\n", path, strerror(errno));
        exit(1);
    }

    /* Leave one byte for the terminator so printing with %s is safe. */
    got = pread(fd, val_read, sizeof(val_read) - 1, 0);
    if (got < 0) {
        printf("read %s:%s\n", path, strerror(errno));
        got = 0;
    }
    val_read[got] = '\0';
    printf("read val:%s\n", val_read);
    close(fd);
}


/*
 * Capture a function trace of the first read from a file-backed shared
 * mapping.
 *
 * Steps: map MPFILE, ask the kernel to drop its cached pages for the file,
 * mount debugfs, restrict ftrace to this process, enable the "function"
 * tracer, clear the trace buffer, perform a single read through the mapping,
 * stop tracing and dump the trace buffer into ./mmap.ftrace.
 *
 * Returns 0 on success; exits with status 1 if the file cannot be opened or
 * mapped (write_val()/write_ftrace() also exit(1) on their own failures).
 */
int main(void)
{
    char *ptr;
    char str_pid[25];
    int fd;
    int adv;
    int pid_cur;
    int i = -1;

    fd = open(MPFILE, O_RDWR);
    if (fd < 0) {
        perror("open()");
        exit(1);
    }

    /* mmap() reports failure with MAP_FAILED, not NULL. */
    ptr = mmap(NULL, MEMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if (ptr == MAP_FAILED) {
        perror("mmap()");
        exit(1);
    }

    /*
     * Keep the page cache from interfering with the trace.
     * posix_fadvise() returns the error number directly and does not set
     * errno, so the message is built from its return value.
     */
    adv = posix_fadvise(fd, 0, 4096, POSIX_FADV_DONTNEED);
    if (adv != 0) {
        printf("Cache FADV_DONTNEED failed, %s\n", strerror(adv));
    } else {
        printf("Cache FADV_DONTNEED done\n");
    }

    /*
     * mount -t debugfs nodev /sys/kernel/debug/
     * The result is deliberately ignored; any real problem shows up when
     * the tracing files are opened below.
     */
    system("mount -t debugfs nodev /sys/kernel/debug/");

    /* echo 1 > /proc/sys/kernel/ftrace_enabled */
    write_val("/proc/sys/kernel/ftrace_enabled", "1");

    /* Restrict tracing to this process: /sys/kernel/debug/tracing/set_ftrace_pid */
    pid_cur = getpid();
    printf("pid:%d\n", pid_cur);
    snprintf(str_pid, sizeof(str_pid), "%d", pid_cur);
    write_ftrace("set_ftrace_pid", str_pid);

    write_ftrace("tracing_on", "1");

    /* echo function > ${ROOT_FTRACE}/current_tracer (other filters could be set here) */
    write_ftrace("current_tracer", "function");

    /* echo 0 > ${ROOT_FTRACE}/trace */
    write_ftrace("trace", "0");

    /*
     * This read is what we want to trace. The volatile access keeps the
     * compiler from moving or dropping the load.
     */
    i = *(volatile char *)ptr;

    /* echo 0 > ${ROOT_FTRACE}/tracing_on  -- stop recording to the buffer */
    write_ftrace("tracing_on", "0");
    if (system("cat /sys/kernel/debug/tracing/trace > mmap.ftrace") != 0) {
        printf("dumping trace to mmap.ftrace failed\n");
    }

    printf("%x\n", (unsigned)i);

    munmap(ptr, MEMSIZE);
    close(fd);

    return 0;
}

关于THREAD_START_SP 2017-02-23

1. 问题引入

https://awakening-fong.github.io/posts/arm/arm_qemu_02 中说到 start_kernel 前设置 sp 为 init_thread_union + THREAD_START_SP

#define THREAD_START_SP (THREAD_SIZE - 8)

为何要-8?

2. 解

/*
 * Return a pointer to the current thread_info, computed from the stack
 * pointer by clearing the bits selected by (THREAD_SIZE - 1).
 */
static inline struct thread_info *current_thread_info(void)
{
    /* Bind the local variable `sp` to the "sp" register. */
    register unsigned long sp asm ("sp");
    /* assumes THREAD_SIZE is a power of two, so this rounds sp down to a
     * THREAD_SIZE boundary -- TODO confirm against the THREAD_SIZE definition */
    return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}

(图示中A点到B点 大小是8字节)
如果sp是SP0, 那么, (sp & ~(THREAD_SIZE - 1)) 后还是指向SP0 (A点);
如果sp是SP1, 那么, (sp & ~(THREAD_SIZE - 1)) 后指向的是thread_info (C点).
所以, 需要-8.