信号 2017-02-26

1. 发送

以tkill为例来说明. linux-3.10.86/kernel/signal.c 假定不是发送给组的, 即__send_signal()的@group为0

do_tkill -> do_send_specific
|--find_task_by_vpid
|--check_kill_permission
|--do_send_sig_info -> send_signal
|   |--__send_signal


__send_signal
|--q=__sigqueue_alloc //new sigqueue instance
|--list_add_tail(&q->list, &pending->list);
|--设置q->info的各域
|   |--q->info.si_signo =
|   |--q->info.si_pid =
|--complete_signal
|   |--signal_wake_up -> signal_wake_up_state
|   |   |--set_tsk_thread_flag(t, TIF_SIGPENDING);
|   |   |--wake_up_state(t, state | TASK_INTERRUPTIBLE) -> try_to_wake_up

fixup_exception 2017-02-25

1. 什么情况下会调用fixup_exception

linux-3.10.86/arch/arm/mm/fault.c

/*
 * Abridged excerpt of the ARM page-fault handler. The blank gaps appear to
 * mark code omitted from the real function (for example, `mm` is used below
 * but its declaration is not part of this excerpt).
 */
static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{

    /* The fault was not raised from user mode: take the no_context path. */
    if (!user_mode(regs))
        goto no_context;


no_context:
    /* Hand the fault over to the kernel-fault handler. */
    __do_kernel_fault(mm, addr, fsr, regs);
    return 0;
}

所以, 用户态(user_mode(regs))发起的 读或写 , copy to/from user等不会调用到 __do_kernel_fault, 也就不会调用fixup_exception的. (fixup_exception 仅会被 __do_kernel_fault调用.)

do_fork和COW 2017-02-24

1. 问题引入

摘自APUE:

由于在fork之后经常跟随着exec, 所以现在的很多实现并不执行一个父进程数据段, 栈和堆的完全复制. 作为替代, 使用了COW技术. 这些区域由父子进程共享, 而且内核将它们的访问权限改变为只读的. 如果父子进程中的任一个试图修改这些区域, 则内核只为修改区域的那块内存制作一个副本, 通常是虚拟存储系统中的一"页".

问题:哪里设置只读, 如何知道只读?

2. 解

快速tips:
PTE entry is marked as un-writeable.
But VMA is marked as writeable.

copy_one_pte() 会调用 ptep_set_wrprotect()

还是先从dup_mm()开始看吧.
linux-3.10.86/kernel/fork.c

mmap和fault handler 2017-02-24

1. 问题引入

我们想知道, 通过mmap映射文件, 然后读取, 是如何读取到文件的, 其中的fault handler相关的内容.

2. ftrace

#include <stdlib.h>
#include <stdio.h>
#include <strings.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

/*
 * Number of bytes mapped from MPFILE. The expansion is parenthesized so the
 * macro keeps its value inside larger expressions (e.g. `x / MEMSIZE`).
 */
#define MEMSIZE (1024 * 4)
/* Path of the file that main() opens and maps. */
#define MPFILE "./.ash_history"

//#define ON_OFF //tracing_on

/*
 * Write `val` to an ftrace control file and echo the file's content back.
 *
 * path: file name relative to /sys/kernel/debug/tracing/ (e.g. "tracing_on").
 * val:  NUL-terminated string written to that file.
 *
 * The file is opened with O_RDWR|O_TRUNC. When `path` is exactly "trace" the
 * read-back step is skipped. A failure to build the path, open the file or
 * write to it prints a message and terminates the process with exit(1); a
 * failed read-back is only reported.
 */
void write_ftrace(char *path, char *val)
{
    char ftrace_path[256];

    /* Bounded formatting: a long `path` can no longer overflow the buffer. */
    int n = snprintf(ftrace_path, sizeof(ftrace_path),
                     "/sys/kernel/debug/tracing/%s", path);
    if (n < 0 || (size_t)n >= sizeof(ftrace_path)) {
        printf("open %s:%s\n", path, strerror(ENAMETOOLONG));
        exit(1);
    }

    int fd_trace = open(ftrace_path, O_RDWR|O_TRUNC);
    if (fd_trace < 0) {
        printf("open %s:%s\n", path, strerror(errno));
        exit(1);
    }

    printf("to write:%s %s\n", path, val);

    if (write(fd_trace, val, strlen(val)) < 0) {
        printf("write %s %s\n", path, strerror(errno));
        close(fd_trace);
        exit(1);
    }

    /* Nothing is read back for the "trace" file itself. */
    if (strcmp(path, "trace") == 0) {
        close(fd_trace);
        return;
    }

    /* Read at most size-1 bytes so the buffer always has room for the NUL. */
    char val_read[20];
    ssize_t got = pread(fd_trace, val_read, sizeof(val_read) - 1, 0);
    if (got < 0) {
        printf("read %s %s\n", path, strerror(errno));
        close(fd_trace);
        return;
    }
    val_read[got] = '\0';
    printf("read val:%s\n", val_read);
    close(fd_trace);
}

/*
 * Write `val` to the file at `path` and echo the file's content back.
 *
 * path: file to open with O_RDWR (it must already exist).
 * val:  NUL-terminated string written at the start of the file.
 *
 * After writing, up to 19 bytes are read from offset 0 and printed. A failed
 * open or write prints a message and terminates the process with exit(1); a
 * failed read-back is only reported.
 */
void write_val(char *path, char *val)
{
    int fd = open(path, O_RDWR);
    if (fd < 0) {
        printf("open %s:%s\n", path, strerror(errno));
        exit(1);
    }

    printf("to write:%s %s\n", path, val);
    if (write(fd, val, strlen(val)) < 0) {
        printf("write %s %s\n", path, strerror(errno));
        close(fd);
        exit(1);
    }

    /* Read at most size-1 bytes so the buffer always has room for the NUL. */
    char val_read[20];
    ssize_t got = pread(fd, val_read, sizeof(val_read) - 1, 0);
    if (got < 0) {
        printf("read %s %s\n", path, strerror(errno));
        close(fd);
        return;
    }
    val_read[got] = '\0';
    printf("read val:%s\n", val_read);
    close(fd);
}


/*
 * Demo driver: map MPFILE, switch on the ftrace "function" tracer for this
 * process, read the first mapped byte (the access being traced), then stop
 * tracing and dump the trace buffer to ./mmap.ftrace.
 *
 * Exits with 1 when the file cannot be opened or mapped (the write_* helpers
 * also exit with 1 on their own failures) and with 0 on success.
 */
int main(void)
{
    char *ptr;
    int fd;
    int ret;
    int i = -1;

    fd = open(MPFILE, O_RDWR);
    if (fd < 0) {
        perror("open()");
        exit(1);
    }

    ptr = mmap(NULL, MEMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    /* mmap signals failure with MAP_FAILED, never with NULL. */
    if (ptr == MAP_FAILED) {
        perror("mmap()");
        close(fd);
        exit(1);
    }

    /*
     * Drop the file's pages from the page cache so they do not interfere
     * with the traced access. posix_fadvise returns the error number
     * directly and does not set errno.
     */
    ret = posix_fadvise(fd, 0, MEMSIZE, POSIX_FADV_DONTNEED);
    if (ret != 0) {
        printf("Cache FADV_DONTNEED failed, %s\n", strerror(ret));
    } else {
        printf("Cache FADV_DONTNEED done\n");
    }

    /* mount -t debugfs nodev /sys/kernel/debug/ */
    system("mount -t debugfs nodev /sys/kernel/debug/");

    /* ~ # echo 1 > /proc/sys/kernel/ftrace_enabled */
    write_val("/proc/sys/kernel/ftrace_enabled", "1");

    /* Restrict tracing to this process: /sys/kernel/debug/tracing/set_ftrace_pid */
    int pid_cur = getpid();
    printf("pid:%d\n", pid_cur);
    char str_pid[25];
    snprintf(str_pid, sizeof(str_pid), "%d", pid_cur);
    write_ftrace("set_ftrace_pid", str_pid);

    write_ftrace("tracing_on", "1");

    /*
     * Other filtering could be configured here.
     * echo function > ${ROOT_FTRACE}/current_tracer
     */
    write_ftrace("current_tracer", "function");

    /* echo 0 > ${ROOT_FTRACE}/trace */
    write_ftrace("trace", "0");

    /*
     * echo start_trace_marker > ${ROOT_FTRACE}/trace_marker
     * exec $*
     */
    //write_ftrace("trace_marker", "start_trace_marker");

    /* This read is the access we want to trace. */
    i = *(volatile char *)ptr;

    /*
     * echo end_trace_marker > ${ROOT_FTRACE}/trace_marker
     * echo 0 > ${ROOT_FTRACE}/tracing_on   # stop recording to the buffer
     */
    //write_ftrace("trace_marker", "end_trace_marker");

    write_ftrace("tracing_on", "0");
    system("cat /sys/kernel/debug/tracing/trace > mmap.ftrace");

    printf("%x\n", i);

    munmap(ptr, MEMSIZE);
    close(fd);

    /* Reaching this point means success, so report status 0 rather than 1. */
    exit(0);
}

模式 01: 内核中常见的性能优化方法 2017-02-23

1. 批量处理

add_to_page_cache_lru, 操作lru的话, 先放到per cpu的lru cache (struct pagevec), 待lru cache存满后drain.

2. 吞吐量(Throughput)和时延(Latency)

2.1 HZ 设置为 250, 1000等.

2.2 中断线程化后, 吞吐量可能下降, 但系统其它地方的时延可能改善.

3. 用空间换时间

计算结果缓存之类的, 比如 一些数值计算, 开根号什么的.

4. 收敛?

radix tree的tag, 不必逐个判断该节点 子树 下的叶子. 这个可能也可以归到 缓存 中.

hash应该也算这个类别.