2016-03-31
张超《Linux内核分析》MOOC课程
Linux如何创建一个新进程
1.我们先阅读理解task_struct数据结构

1235struct task_struct {1236 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */1237 void *stack;1238 atomic_t usage;1239 unsigned int flags; /* per process flags, defined below */1240 unsigned int ptrace;12411242#ifdef CONFIG_SMP1243 struct llist_node wake_entry;1244 int on_cpu;1245 struct task_struct *last_wakee;1246 unsigned long wakee_flips;1247 unsigned long wakee_flip_decay_ts;12481249 int wake_cpu;1250#endif1251 int on_rq;12521253 int prio, static_prio, normal_prio;1254 unsigned int rt_priority;1255 const struct sched_class *sched_class;1256 struct sched_entity se;1257 struct sched_rt_entity rt;1258#ifdef CONFIG_CGROUP_SCHED1259 struct task_group *sched_task_group;1260#endif1261 struct sched_dl_entity dl;12621263#ifdef CONFIG_PREEMPT_NOTIFIERS1264 /* list of struct preempt_notifier: */1265 struct hlist_head preempt_notifiers;1266#endif12671268#ifdef CONFIG_BLK_DEV_IO_TRACE1269 unsigned int btrace_seq;1270#endif12711272 unsigned int policy;1273 int nr_cpus_allowed;1274 cpumask_t cpus_allowed;12751276#ifdef CONFIG_PREEMPT_RCU1277 int rcu_read_lock_nesting;1278 union rcu_special rcu_read_unlock_special;1279 struct list_head rcu_node_entry;1280#endif /* #ifdef CONFIG_PREEMPT_RCU */1281#ifdef CONFIG_TREE_PREEMPT_RCU1282 struct rcu_node *rcu_blocked_node;1283#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */1284#ifdef CONFIG_TASKS_RCU1285 unsigned long rcu_tasks_nvcsw;1286 bool rcu_tasks_holdout;1287 struct list_head rcu_tasks_holdout_list;1288 int rcu_tasks_idle_cpu;1289#endif /* #ifdef CONFIG_TASKS_RCU */12901291#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)1292 struct sched_info sched_info;1293#endif12941295 struct list_head tasks;1296#ifdef CONFIG_SMP1297 struct plist_node pushable_tasks;1298 struct rb_node pushable_dl_tasks;1299#endif13001301 struct mm_struct *mm, *active_mm;1302#ifdef CONFIG_COMPAT_BRK1303 unsigned brk_randomized:1;1304#endif1305 /* per-thread vma caching */1306 u32 vmacache_seqnum;1307 struct vm_area_struct 
*vmacache[VMACACHE_SIZE];1308#if defined(SPLIT_RSS_COUNTING)1309 struct task_rss_stat rss_stat;1310#endif1311/* task state */1312 int exit_state;1313 int exit_code, exit_signal;1314 int pdeath_signal; /* The signal sent when the parent dies */1315 unsigned int jobctl; /* JOBCTL_*, siglock protected */13161317 /* Used for emulating ABI behavior of previous Linux versions */1318 unsigned int personality;13191320 unsigned in_execve:1; /* Tell the LSMs that the process is doing an1321 * execve */1322 unsigned in_iowait:1;13231324 /* Revert to default priority/policy when forking */1325 unsigned sched_reset_on_fork:1;1326 unsigned sched_contributes_to_load:1;13271328 unsigned long atomic_flags; /* Flags needing atomic access. */13291330 pid_t pid;1331 pid_t tgid;13321333#ifdef CONFIG_CC_STACKPROTECTOR1334 /* Canary value for the -fstack-protector gcc feature */1335 unsigned long stack_canary;1336#endif1337 /*1338 * pointers to (original) parent process, youngest child, younger sibling,1339 * older sibling, respectively. (p->father can be replaced with1340 * p->real_parent->pid)1341 */1342 struct task_struct __rcu *real_parent; /* real parent process */1343 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */1344 /*1345 * children/sibling forms the list of my natural children1346 */1347 struct list_head children; /* list of my children */1348 struct list_head sibling; /* linkage in my parent's children list */1349 struct task_struct *group_leader; /* threadgroup leader */13501351 /*1352 * ptraced is the list of tasks this task is using ptrace on.1353 * This includes both natural children and PTRACE_ATTACH targets.1354 * p->ptrace_entry is p's link on the p->parent->ptraced list.1355 */1356 struct list_head ptraced;1357 struct list_head ptrace_entry;13581359 /* PID/PID hash table linkage. 
*/1360 struct pid_link pids[PIDTYPE_MAX];1361 struct list_head thread_group;1362 struct list_head thread_node;13631364 struct completion *vfork_done; /* for vfork() */1365 int __user *set_child_tid; /* CLONE_CHILD_SETTID */1366 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */13671368 cputime_t utime, stime, utimescaled, stimescaled;1369 cputime_t gtime;1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE1371 struct cputime prev_cputime;1372#endif1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN1374 seqlock_t vtime_seqlock;1375 unsigned long long vtime_snap;1376 enum {1377 VTIME_SLEEPING = 0,1378 VTIME_USER,1379 VTIME_SYS,1380 } vtime_snap_whence;1381#endif1382 unsigned long nvcsw, nivcsw; /* context switch counts */1383 u64 start_time; /* monotonic time in nsec */1384 u64 real_start_time; /* boot based time in nsec */1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */1386 unsigned long min_flt, maj_flt;13871388 struct task_cputime cputime_expires;1389 struct list_head cpu_timers[3];13901391/* process credentials */1392 const struct cred __rcu *real_cred; /* objective and real subjective task1393 * credentials (COW) */1394 const struct cred __rcu *cred; /* effective (overridable) subjective task1395 * credentials (COW) */1396 char comm[TASK_COMM_LEN]; /* executable name excluding path1397 - access with [gs]et_task_comm (which lock1398 it with task_lock())1399 - initialized normally by setup_new_exec */1400/* file system info */1401 int link_count, total_link_count;1402#ifdef CONFIG_SYSVIPC1403/* ipc stuff */1404 struct sysv_sem sysvsem;1405 struct sysv_shm sysvshm;1406#endif1407#ifdef CONFIG_DETECT_HUNG_TASK1408/* hung task detection */1409 unsigned long last_switch_count;1410#endif1411/* CPU-specific state of this task */1412 struct thread_struct thread;1413/* filesystem information */1414 struct fs_struct *fs;1415/* open file information */1416 struct files_struct *files;1417/* namespaces */1418 struct nsproxy 
*nsproxy;1419/* signal handlers */1420 struct signal_struct *signal;1421 struct sighand_struct *sighand;14221423 sigset_t blocked, real_blocked;1424 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */1425 struct sigpending pending;14261427 unsigned long sas_ss_sp;1428 size_t sas_ss_size;1429 int (*notifier)(void *priv);1430 void *notifier_data;1431 sigset_t *notifier_mask;1432 struct callback_head *task_works;14331434 struct audit_context *audit_context;1435#ifdef CONFIG_AUDITSYSCALL1436 kuid_t loginuid;1437 unsigned int sessionid;1438#endif1439 struct seccomp seccomp;14401441/* Thread group tracking */1442 u32 parent_exec_id;1443 u32 self_exec_id;1444/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,1445 * mempolicy */1446 spinlock_t alloc_lock;14471448 /* Protection of the PI data structures: */1449 raw_spinlock_t pi_lock;14501451#ifdef CONFIG_RT_MUTEXES1452 /* PI waiters blocked on a rt_mutex held by this task */1453 struct rb_root pi_waiters;1454 struct rb_node *pi_waiters_leftmost;1455 /* Deadlock detection and priority inheritance handling */1456 struct rt_mutex_waiter *pi_blocked_on;1457#endif14581459#ifdef CONFIG_DEBUG_MUTEXES1460 /* mutex deadlock detection */1461 struct mutex_waiter *blocked_on;1462#endif1463#ifdef CONFIG_TRACE_IRQFLAGS1464 unsigned int irq_events;1465 unsigned long hardirq_enable_ip;1466 unsigned long hardirq_disable_ip;1467 unsigned int hardirq_enable_event;1468 unsigned int hardirq_disable_event;1469 int hardirqs_enabled;1470 int hardirq_context;1471 unsigned long softirq_disable_ip;1472 unsigned long softirq_enable_ip;1473 unsigned int softirq_disable_event;1474 unsigned int softirq_enable_event;1475 int softirqs_enabled;1476 int softirq_context;1477#endif1478#ifdef CONFIG_LOCKDEP1479# define MAX_LOCK_DEPTH 48UL1480 u64 curr_chain_key;1481 int lockdep_depth;1482 unsigned int lockdep_recursion;1483 struct held_lock held_locks[MAX_LOCK_DEPTH];1484 gfp_t 
lockdep_reclaim_gfp;1485#endif14861487/* journalling filesystem info */1488 void *journal_info;14891490/* stacked block device info */1491 struct bio_list *bio_list;14921493#ifdef CONFIG_BLOCK1494/* stack plugging */1495 struct blk_plug *plug;1496#endif14971498/* VM state */1499 struct reclaim_state *reclaim_state;15001501 struct backing_dev_info *backing_dev_info;15021503 struct io_context *io_context;15041505 unsigned long ptrace_message;1506 siginfo_t *last_siginfo; /* For ptrace use. */1507 struct task_io_accounting ioac;1508#if defined(CONFIG_TASK_XACCT)1509 u64 acct_rss_mem1; /* accumulated rss usage */1510 u64 acct_vm_mem1; /* accumulated virtual memory usage */1511 cputime_t acct_timexpd; /* stime + utime since last update */1512#endif1513#ifdef CONFIG_CPUSETS1514 nodemask_t mems_allowed; /* Protected by alloc_lock */1515 seqcount_t mems_allowed_seq; /* Seqence no to catch updates */1516 int cpuset_mem_spread_rotor;1517 int cpuset_slab_spread_rotor;1518#endif1519#ifdef CONFIG_CGROUPS1520 /* Control Group info protected by css_set_lock */1521 struct css_set __rcu *cgroups;1522 /* cg_list protected by css_set_lock and tsk->alloc_lock */1523 struct list_head cg_list;1524#endif1525#ifdef CONFIG_FUTEX1526 struct robust_list_head __user *robust_list;1527#ifdef CONFIG_COMPAT1528 struct compat_robust_list_head __user *compat_robust_list;1529#endif1530 struct list_head pi_state_list;1531 struct futex_pi_state *pi_state_cache;1532#endif1533#ifdef CONFIG_PERF_EVENTS1534 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];1535 struct mutex perf_event_mutex;1536 struct list_head perf_event_list;1537#endif1538#ifdef CONFIG_DEBUG_PREEMPT1539 unsigned long preempt_disable_ip;1540#endif1541#ifdef CONFIG_NUMA1542 struct mempolicy *mempolicy; /* Protected by alloc_lock */1543 short il_next;1544 short pref_node_fork;1545#endif1546#ifdef CONFIG_NUMA_BALANCING1547 int numa_scan_seq;1548 unsigned int numa_scan_period;1549 unsigned int numa_scan_period_max;1550 int 
numa_preferred_nid;1551 unsigned long numa_migrate_retry;1552 u64 node_stamp; /* migration stamp */1553 u64 last_task_numa_placement;1554 u64 last_sum_exec_runtime;1555 struct callback_head numa_work;15561557 struct list_head numa_entry;1558 struct numa_group *numa_group;15591560 /*1561 * Exponential decaying average of faults on a per-node basis.1562 * Scheduling placement decisions are made based on the these counts.1563 * The values remain static for the duration of a PTE scan1564 */1565 unsigned long *numa_faults_memory;1566 unsigned long total_numa_faults;15671568 /*1569 * numa_faults_buffer records faults per node during the current1570 * scan window. When the scan completes, the counts in1571 * numa_faults_memory decay and these values are copied.1572 */1573 unsigned long *numa_faults_buffer_memory;15741575 /*1576 * Track the nodes the process was running on when a NUMA hinting1577 * fault was incurred.1578 */1579 unsigned long *numa_faults_cpu;1580 unsigned long *numa_faults_buffer_cpu;15811582 /*1583 * numa_faults_locality tracks if faults recorded during the last1584 * scan window were remote/local. 
The task scan period is adapted1585 * based on the locality of the faults with different weights1586 * depending on whether they were shared or private faults1587 */1588 unsigned long numa_faults_locality[2];15891590 unsigned long numa_pages_migrated;1591#endif /* CONFIG_NUMA_BALANCING */15921593 struct rcu_head rcu;15941595 /*1596 * cache last used pipe for splice1597 */1598 struct pipe_inode_info *splice_pipe;15991600 struct page_frag task_frag;16011602#ifdef CONFIG_TASK_DELAY_ACCT1603 struct task_delay_info *delays;1604#endif1605#ifdef CONFIG_FAULT_INJECTION1606 int make_it_fail;1607#endif1608 /*1609 * when (nr_dirtied >= nr_dirtied_pause), it's time to call1610 * balance_dirty_pages() for some dirty throttling pause1611 */1612 int nr_dirtied;1613 int nr_dirtied_pause;1614 unsigned long dirty_paused_when; /* start of a write-and-pause period */16151616#ifdef CONFIG_LATENCYTOP1617 int latency_record_count;1618 struct latency_record latency_record[LT_SAVECOUNT];1619#endif1620 /*1621 * time slack values; these are used to round up poll() and1622 * select() etc timeout values. 
These are in nanoseconds.1623 */1624 unsigned long timer_slack_ns;1625 unsigned long default_timer_slack_ns;16261627#ifdef CONFIG_FUNCTION_GRAPH_TRACER1628 /* Index of current stored address in ret_stack */1629 int curr_ret_stack;1630 /* Stack of return addresses for return function tracing */1631 struct ftrace_ret_stack *ret_stack;1632 /* time stamp for last schedule */1633 unsigned long long ftrace_timestamp;1634 /*1635 * Number of functions that haven't been traced1636 * because of depth overrun.1637 */1638 atomic_t trace_overrun;1639 /* Pause for the tracing */1640 atomic_t tracing_graph_pause;1641#endif1642#ifdef CONFIG_TRACING1643 /* state flags for use by tracers */1644 unsigned long trace;1645 /* bitmask and counter of trace recursion */1646 unsigned long trace_recursion;1647#endif /* CONFIG_TRACING */1648#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */1649 unsigned int memcg_kmem_skip_account;1650 struct memcg_oom_info {1651 struct mem_cgroup *memcg;1652 gfp_t gfp_mask;1653 int order;1654 unsigned int may_oom:1;1655 } memcg_oom;1656#endif1657#ifdef CONFIG_UPROBES1658 struct uprobe_task *utask;1659#endif1660#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)1661 unsigned int sequential_io;1662 unsigned int sequential_io_avg;1663#endif1664};
关于task_struct的具体介绍,见
它定义在linux-3.18.6/include/linux/sched.h文件中。
进程(Process)是系统进行资源分配和调度的基本单位,一个进程是一个程序的运行实例。而在Linux中,可以使用一个进程来创建另外一个进程。这样的话,Linux的进程的组织结
构其实有点像Linux目录树,是个层次结构的,可以使用 pstree命令来查看。在最上面是init程序的执行进程。它是所有进程的老祖宗。Linux提供了两个函数来创建进程。
1.fork()
fork()提供了创建进程的基本操作,可以说它是Linux系统多任务的基础。该函数在<unistd.h>中声明。
2.exec系列函数
如果只有fork(),肯定是不完美的,因为fork()只能产生一个父进程的副本。而exec系列函数则可以让进程装入并执行一个全新的程序。
在Linux系统中,一个进程的PCB由C语言的结构体task_struct来表示,而多个PCB之间是由一个双向链表组织起来的,在《Understanding the Linux Kernel》中,则进一步说明
这个链表是一个双向循环链表。
在Linux中创建一个新进程的方法是使用fork函数,fork()执行一次但有两个返回值。
在父进程中,返回值是子进程的进程号;在子进程中,返回值为0。因此可通过返回值来判断当前进程是父进程还是子进程。
使用fork函数得到的子进程是父进程的一个复制品,它从父进程处复制了整个进程的地址空间,包括进程上下文,进程堆栈,内存信息,打开的文件描述符,信号控制设定,进程优
先级,进程组号,当前工作目录,根目录,资源限制,控制终端等。而子进程所独有的只是它的进程号,资源使用和计时器等。可以看出,使用fork函数的代价是很大的,它复制了
父进程中的代码段,数据段和堆栈段里的大部分内容,使得fork函数的执行速度并不快。
创建一个进程,至少涉及的函数:
sys_clone, do_fork, dup_task_struct, copy_process, copy_thread, ret_from_fork

这只是图中的fork一个分支
学习笔记
进程的描述
1.进程描述符task_struct数据结构(一)
为了管理进程,内核必须对每个进程进行清晰的描述,进程描述符提供了内核所需了解的进程信息。
- struct task_struct数据结构很庞大
- Linux进程的状态与操作系统原理中的描述的进程状态似乎有所不同,比如就绪状态和运行状态都是TASK_RUNNING,为什么呢?
- 进程的标识pid
- 所有进程链表struct list_head tasks; 内核的双向循环链表的实现方法 - 一个更简略的双向循环链表
- 程序创建的进程具有父子关系,在编程时往往需要引用这样的父子关系。进程描述符中有几个域用来表示这样的关系
- Linux为每个进程分配一个8KB大小的内存区域,用于存放该进程两个不同的数据结构:thread_info和进程的内核堆栈
进程处于内核态时使用,不同于用户态堆栈,即PCB中指定了内核栈,那为什么PCB中没有用户态堆栈?用户态堆栈是怎么设定的?
内核控制路径所用的堆栈很少,因此对栈和thread_info来说,8KB足够了
- struct thread_struct thread; //CPU-specific state of this task
- 文件系统和文件描述符
- 内存管理——进程的地址空间
进程状态的切换过程和原因大致如下图:
双向循环链表图如下:
进程的父子关系直观图:
进程的创建
1.进程的创建概览及fork一个进程的用户态代码
(1)进程的起源再回顾
- 道生一(start_kernel...cpu_idle)
- 一生二(kernel_init和kthreadd)
- 二生三(即前面的0、1、2三个进程)
- 三生万物(1号进程是所有用户态进程的祖先,2号进程是所有内核线程的祖先)
(2)0号进程手工写,1号进程复制、加载init程序
(3)shell命令行是如何启动进程的
fork一个子进程的代码:


/*
 * Minimal user-space fork() demo: the parent forks one child, each side
 * prints a message, and the parent waits for the child before finishing.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	pid_t pid;

	(void)argc;
	(void)argv;

	/* fork another process */
	pid = fork();
	if (pid < 0) {
		/* error occurred: error handling */
		fprintf(stderr, "Fork Failed!");
		exit(-1);
	} else if (pid == 0) {
		/*
		 * child process: pid == 0 here.
		 * Both this branch and the else branch below get executed,
		 * because the fork system call returns once in the parent
		 * and once in the child.
		 */
		printf("This is Child Process!\n");
	} else {
		/* parent process */
		printf("This is Parent Process!\n");
		/* parent will wait for the child to complete */
		wait(NULL);
		printf("Child Complete!\n");
	}

	return 0;
}
2.理解进程创建过程复杂代码的方法
(1)系统调用再回顾
(2)fork的子进程是从哪里开始执行的?
与基于mykernel写的精简内核对照起来。
(3)创建一个新进程在内核中的执行过程
- fork、vfork和clone三个系统调用都可以创建一个新进程,而且都是通过调用do_fork来实现进程的创建;
- Linux通过复制父进程来创建一个新进程,那么这就给我们理解这一个过程提供一个想象的框架:
- 复制一个PCB——task_struct
err = arch_dup_task_struct(tsk, orig);
- 要给新进程分配一个新的内核堆栈
ti = alloc_thread_info_node(tsk, node);tsk->stack = ti;setup_thread_stack(tsk, orig); //这里只是复制thread_info,而非复制内核堆栈
- 要修改复制过来的进程数据,比如pid、进程链表等等都要改改吧,见copy_process内部。
- 从用户态的代码看fork();函数返回了两次,即在父子进程中各返回一次,父进程从系统调用中返回比较容易理解,子进程从系统调用中返回,那它在系统调用处理过程中的哪里开始执行的呢?这就涉及子进程的内核堆栈数据状态和task_struct中thread记录的sp和ip的一致性问题,这是在哪里设定的?copy_thread in copy_process
1 *childregs = *current_pt_regs(); //复制内核堆栈2 childregs->ax = 0; //为什么子进程的fork返回0,这里就是原因!3 4 p->thread.sp = (unsigned long) childregs; //调度到子进程时的内核栈顶5 p->thread.ip = (unsigned long) ret_from_fork; //调度到子进程时的第一条指令地址
(4)理解复杂事物要预设一个大致的框架。
(5)创建新进程是通过复制当前进程来实现的。
(6)设想创建新进程过程中需要做哪些事
3.浏览进程创建过程相关的关键代码
(1)系统调用内核处理函数sys_fork、sys_clone、sys_vfork
最终都是执行do_fork()。
do_fork()里的复制进程的函数:
具体:
打开复制PCB的具体函数:
打开alloc_thread_info():
拷贝内核堆栈数据和指定新进程的第一条指令地址。
4.创建的新进程是从哪里开始执行的?
(1)复制内核堆栈时
打开pt_regs:
int指令和SAVE_ALL压到内核栈的内容。
下面分析entry_32.S,也就是总控程序。
5.使用gdb跟踪创建新进程的过程(见作业)
实验:
1、流程
添加fork()到MenuOS
编译并启动MenuOS
用GDB连接,添加breakpoints,
根据观察copy_process是建立新进程,
wake_up_new_task则是运行这个新进程,所以要尝试添加这样一个断点
breakpoints list:
b sys_clone
b do_fork
b copy_process
b dup_task_struct
b alloc_task_struct_node
b arch_dup_task_struct
b copy_thread
b ret_from_fork
b wake_up_new_task
跟踪fork执行
2、实验记录
2.1 添加并验证fork()可用
2.2 跟踪fork
四、总结
fork创建的新进程是和父进程(除了PID和PPID)一样的副本,包括真实和有效的UID和GID、进程组和会话ID、环境、资源限制、打开的文件以及共享内存段。
根据代码的分析,do_fork中,copy_process负责子进程运行前的准备工作,wake_up_new_task则唤醒新进程,标志着fork的完成。