Linux Kernel 02 - System Calls
# Prerequisites
# Preface
# Implementation Overview
# Syscall Handling
For x86-64, every 64-bit system call enters the kernel through a single entry point, entry_SYSCALL_64, and then proceeds as follows :
First, entry_SYSCALL_64 calls do_syscall_64.
Then, do_syscall_64 calls do_syscall_x64.
Finally, do_syscall_x64 calls the corresponding handler (i.e. __x64_sys_<syscallname>), which it looks up in the array sys_call_table using the syscall number passed by the user-level caller.
entry_SYSCALL_64
is defined in arch/x86/entry/entry_64.S
:
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
* This is the only entry point used for 64-bit system calls. The
* hardware interface is reasonably well designed and the register to
* argument mapping Linux uses fits well with the registers that are
* available when SYSCALL is used.
*
* SYSCALL instructions can be found inlined in libc implementations as
* well as some other programs and libraries. There are also a handful
* of SYSCALL instructions in the vDSO used, for example, as a
* clock_gettimeofday fallback.
*
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Registers on entry:
* rax system call number
* rcx return address
* r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
* rsi arg1
* rdx arg2
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
* (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
* When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_ENTRY
swapgs
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
movq %rsp, %rdi
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
call do_syscall_64 /* returns with IRQs disabled */
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
* In the Xen PV case we must use iret anyway.
*/
ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
X86_FEATURE_XENPV
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
jne swapgs_restore_regs_and_return_to_usermode
do_syscall_64
and do_syscall_x64
are defined in arch/x86/entry/common.c
:
/*
 * C entry point for 64-bit system calls; entry_SYSCALL_64 calls it with
 * regs = the pt_regs frame built on the stack and nr = the sign-extended
 * syscall number.
 */
__visible noinstr void do_syscall_64(struct pt_regs * regs, int nr) {
add_random_kstack_offset();
/* the entry hook returns the syscall number that is used from here on */
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
/* try the x64 table first, then x32; nr == -1 is excluded from the fallback */
if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
/* Invalid system call, but still a system call. */
regs->ax = __x64_sys_ni_syscall(regs);
}
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
/*
 * Dispatch syscall nr through sys_call_table.
 * Returns true if nr is a valid table index (the handler's return value is
 * stored in regs->ax), false if nr is out of range and nothing was called.
 */
static __always_inline bool do_syscall_x64(struct pt_regs * regs, int nr) {
/*
* Convert negative numbers to very high and thus out of range
* numbers for comparisons.
*/
unsigned int unr = nr;
if (likely(unr < NR_syscalls)) {
/* NOTE(review): array_index_nospec presumably keeps unr in bounds under speculative execution -- confirm in its definition */
unr = array_index_nospec(unr, NR_syscalls);
regs->ax = sys_call_table[unr](regs);
return true;
}
return false;
}
# Syscall Definition
A system call in linux kernel is defined through SYSCALL_DEFINEn
macros, where n
indicates the number of parameters, ranging from 0 to 6. Take sys_write
(defined in fs/read_write.c
) as an example :
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) {
return ksys_write(fd, buf, count);
}
ssize_t ksys_write(unsigned int fd, const char __user * buf, size_t count) {
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
// ... irrelevant contents omitted
}
return ret;
}
We omit implementation details of SYSCALL_DEFINEn
for brevity and move them to a separate optional chapter (see Implementation Details : Syscall Definition). Finally, the definition of sys_write
will be expanded to :
/*
 * Expansion result of SYSCALL_DEFINE3(write, ...).
 * Every argument of __se_sys_write is widened by __SC_LONG: only (unsigned)
 * long long types become long long, everything else -- including the
 * pointer buf -- becomes long.
 */
static long __se_sys_write(long fd, long buf, long count);
static inline long __do_sys_write(unsigned int fd, const char __user * buf, size_t count);
long __x64_sys_write(const struct pt_regs * regs);
/* Entry placed in sys_call_table: unpacks the arguments from the saved registers. */
long __x64_sys_write(const struct pt_regs * regs)
{
return __se_sys_write(regs->di, regs->si, regs->dx);
}
/* Casts the widened arguments back to their declared types. */
static long __se_sys_write(long fd, long buf, long count)
{
long ret = __do_sys_write((__force unsigned int) fd, (__force const char __user *) buf, (__force size_t) count);
return ret;
}
/* The body written between the braces of SYSCALL_DEFINE3(write, ...). */
static inline long __do_sys_write(unsigned int fd, const char __user * buf, size_t count) {
return ksys_write(fd, buf, count);
}
ssize_t ksys_write(unsigned int fd, const char __user * buf, size_t count) {
// ...
}
For better understanding, see the call stack below, captured when kernel execution reaches ksys_write
(i.e. the code that does the actual work) :
(gdb) bt
#0 ksys_write (fd=1, buf=0x862ce0 "\nhello, busybox!\n\n", count=18) at fs/read_write.c:638
#1 __do_sys_write (count=18, buf=0x862ce0 "\nhello, busybox!\n\n", fd=1) at fs/read_write.c:659
#2 __se_sys_write (count=18, buf=8793312, fd=1) at fs/read_write.c:656
#3 __x64_sys_write (regs=0xffffc9000000bf58) at fs/read_write.c:656
#4 0xffffffff811383eb in do_syscall_x64 (nr=<optimized out>, regs=0xffffc9000000bf58) at arch/x86/entry/common.c:50
#5 do_syscall_64 (regs=0xffffc9000000bf58, nr=<optimized out>) at arch/x86/entry/common.c:80
#6 0xffffffff81200065 in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:118
#7 0x0000000000000000 in ?? ()
# Syscall Table
The syscall table is defined in arch/x86/entry/syscall_64.c
:
#include <asm/syscall.h>
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
#undef __SYSCALL
#define __SYSCALL(nr, sym) __x64_##sym,
asmlinkage const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
Element type sys_call_ptr_t
represents a pointer to a system call handler function. It is defined as a typedef
in asm/syscall.h
which indicates arch/x86/include/asm/syscall.h
:
typedef long (* sys_call_ptr_t)(const struct pt_regs *);
- Parameter type
struct pt_regs
is used for ptrace to save register context if necessary (see arch/x86/include/asm/ptrace.h
) .
asm/syscalls_64.h
indicates arch/x86/include/generated/asm/syscalls_64.h
, which is generated from arch/x86/entry/syscalls/syscall_64.tbl by the script scripts/syscalltbl.sh
:
__SYSCALL(0, sys_read)
__SYSCALL(1, sys_write)
...
__SYSCALL(335, sys_ni_syscall)
...
sys_ni_syscall
represents system calls that are not implemented; its trivial implementation is in kernel/sys_ni.c
:
/* Handler for syscall numbers that have no implementation: always returns -ENOSYS. */
asmlinkage long sys_ni_syscall(void) {
return -ENOSYS;
}
The -ENOSYS
return value is the negated POSIX error code ENOSYS ("Function not implemented").
arch/x86/entry/syscalls/syscall_64.tbl
declares 64-bit system call numbers and entry vectors :
0 common read sys_read
1 common write sys_write
...
After macro expansion of __SYSCALL
, the aforementioned file syscall_64.c
takes the following form :
extern long __x64_sys_read(const struct pt_regs *);
extern long __x64_sys_write(const struct pt_regs *);
...
extern long __x64_sys_ni_syscall(const struct pt_regs *);
...
asmlinkage const sys_call_ptr_t sys_call_table[] = {
__x64_sys_read,
__x64_sys_write,
...
__x64_sys_ni_syscall,
...
};
Finally, when a system call arises, the sys_call_table
is used (by aforementioned do_syscall_x64
) to execute the corresponding function.
# Implementation Details (Optional)
# Syscall Definition
The macros SYSCALL_DEFINEn
are defined in include/linux/syscalls.h
:
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE_MAXARGS 6
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
The macro SYSCALL_METADATA
is expanded only if the kernel config option CONFIG_FTRACE_SYSCALLS
is enabled. Now just ignore it.
Note that SYSCALL_DEFINE0
and __SYSCALL_DEFINEx
are special. As the early lines in include/linux/syscalls.h
say :
#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
/*
* It may be useful for an architecture to override the definitions of the
* SYSCALL_DEFINE0() and __SYSCALL_DEFINEx() macros, in particular to use a
* different calling convention for syscalls. To allow for that, the prototypes
* for the sys_*() functions below will *not* be included if
* CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.
*/
#include <asm/syscall_wrapper.h>
#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
If the kernel config option CONFIG_ARCH_HAS_SYSCALL_WRAPPER
is enabled (which is the case for the default tinyconfig
we used), we can find the definition of SYSCALL_DEFINE0
and __SYSCALL_DEFINEx
in asm/syscall_wrapper.h
(which indicates arch/x86/include/asm/syscall_wrapper.h
). Otherwise, they are defined in include/linux/syscalls.h
generically. For brevity, their generic definitions are omitted.
# SYSCALL_DEFINE0
The macro SYSCALL_DEFINE0
is defined in asm/syscall_wrapper.h
(for default tinyconfig
we used) :
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
static long __do_sys_##sname(const struct pt_regs *__unused); \
__X64_SYS_STUB0(sname) \
__IA32_SYS_STUB0(sname) \
static long __do_sys_##sname(const struct pt_regs *__unused)
SYSCALL_DEFINE0
is easy to understand (it doesn't have terrible parameter expansion). Take sys_getpid
as an example :
SYSCALL_DEFINE0(getpid) {
return task_tgid_vnr(current);
}
The macro __X64_SYS_STUB0
is defined in asm/syscall_wrapper.h
:
#define __X64_SYS_STUB0(name) \
__SYS_STUB0(x64, sys_##name)
#define __SYS_STUB0(abi, name) \
long __##abi##_##name(const struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
long __##abi##_##name(const struct pt_regs *regs) \
__alias(__do_##name);
The macro ALLOW_ERROR_INJECTION
is expanded only if the kernel config option CONFIG_FUNCTION_ERROR_INJECTION
is enabled. Now just ignore it.
For getpid, __X64_SYS_STUB0 is expanded as follows :
__X64_SYS_STUB0(getpid)
__SYS_STUB0(x64, sys_getpid)
long __x64_sys_getpid(const struct pt_regs * regs);
long __x64_sys_getpid(const struct pt_regs * regs) __alias(__do_sys_getpid);
Note that the macro __X64_SYS_STUB0
defines __x64_sys_getpid
as the alias of __do_sys_getpid
. This is quite different from __X64_SYS_STUBx
in SYSCALL_DEFINEx
, where __x64_sys_xxx
calls __se_sys_xxx
and then __se_sys_xxx
calls __do_sys_xxx
.
Finally, the definition of sys_getpid
will be expanded to :
static long __do_sys_getpid(const struct pt_regs *__unused);
__X64_SYS_STUB0(getpid)
__IA32_SYS_STUB0(getpid)
static long __do_sys_getpid(const struct pt_regs *__unused) {
return task_tgid_vnr(current);
}
# SYSCALL_DEFINEn
The macro __SYSCALL_DEFINEx
is defined in asm/syscall_wrapper.h
(for default tinyconfig
we used) :
#define __SYSCALL_DEFINEx(x, name, ...) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
__X64_SYS_STUBx(x, name, __VA_ARGS__) \
__IA32_SYS_STUBx(x, name, __VA_ARGS__) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
Now let's go through the macro expansion of __SYSCALL_DEFINEx
.
__MAP and __SC_xxx
The __MAP
and __SC_xxx
macros defined in include/linux/syscalls.h
are used for parameter type expansion :
/*
* __MAP - apply a macro to syscall arguments
* __MAP(n, m, t1, a1, t2, a2, ..., tn, an) will expand to
* m(t1, a1), m(t2, a2), ..., m(tn, an)
* The first argument must be equal to the amount of type/name
* pairs given. Note that this list of pairs (i.e. the arguments
* of __MAP starting at the third one) is in the same format as
* for SYSCALL_DEFINE<n>/COMPAT_SYSCALL_DEFINE<n>
*/
#define __MAP0(m, ...)
#define __MAP1(m, t, a, ...) m(t, a)
#define __MAP2(m, t, a, ...) m(t, a), __MAP1(m, __VA_ARGS__)
#define __MAP3(m, t, a, ...) m(t, a), __MAP2(m, __VA_ARGS__)
#define __MAP4(m, t, a, ...) m(t, a), __MAP3(m, __VA_ARGS__)
#define __MAP5(m, t, a, ...) m(t, a), __MAP4(m, __VA_ARGS__)
#define __MAP6(m, t, a, ...) m(t, a), __MAP5(m, __VA_ARGS__)
#define __MAP(n, ...) __MAP##n(__VA_ARGS__)
#define __SC_DECL(t, a) t a
#define __TYPE_AS(t, v) __same_type((__force t) 0, v)
#define __TYPE_IS_LL(t) (__TYPE_AS(t, 0LL) || __TYPE_AS(t, 0ULL))
#define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a
#define __SC_CAST(t, a) (__force t) a
#define __SC_ARGS(t, a) a
#define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))
The macro __same_type
used above is defined in include/linux/compiler_types.h
:
/* Are two types/vars the same type (ignoring qualifiers)? */
#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
The macro __force
used above is only meaningful to the Sparse static analysis tool. It has no effect on a normal kernel build, so just ignore it for now.
The built-in function __builtin_choose_expr
used above is a GCC extension, which is similar to (but different from) the ? :
operator : its condition must be a constant expression, and the result takes the type of the selected operand. For more details of __builtin_choose_expr
, see GCC Documentation - Other Builtins.
The macro BUILD_BUG_ON_ZERO
used above is defined in include/linux/build_bug.h
:
#ifdef __CHECKER__
#define BUILD_BUG_ON_ZERO(e) (0)
#else /* __CHECKER__ */
/*
* Force a compilation error if condition is true, but also produce a
* result (of value 0 and type int), so the expression can be used
* e.g. in a structure initializer (or where-ever else comma expressions
* aren't permitted).
*/
#define BUILD_BUG_ON_ZERO(e) ((int)(sizeof(struct { int:(-!!(e)); })))
#endif /* __CHECKER__ */
__X64_SYS_STUBx
The macro __X64_SYS_STUBx
(similar to __X64_SYS_STUB0
but more complicated) is defined in asm/syscall_wrapper.h
:
#define __X64_SYS_STUBx(x, name, ...) \
__SYS_STUBx(x64, sys##name, \
SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__))
#define __SYS_STUBx(abi, name, ...) \
long __##abi##_##name(const struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
long __##abi##_##name(const struct pt_regs *regs) \
{ \
return __se_##name(__VA_ARGS__); \
}
/* Mapping of registers to parameters for syscalls on x86-64 and x32 */
#define SC_X86_64_REGS_TO_ARGS(x, ...) \
__MAP(x,__SC_ARGS \
,,regs->di,,regs->si,,regs->dx \
,,regs->r10,,regs->r8,,regs->r9) \
Expansion Example
The definition of sys_write
is expanded as follows :
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)
SYSCALL_DEFINEx(3, _write, unsigned int, fd, const char __user *, buf, size_t, count)
__SYSCALL_DEFINEx(3, _write, unsigned int, fd, const char __user *, buf, size_t, count)
static long __se_sys_write(__MAP(3, __SC_LONG, unsigned int, fd, const char __user *, buf, size_t, count));
static long __se_sys_write(__SC_LONG(unsigned int, fd), __SC_LONG(const char __user *, buf), __SC_LONG(size_t, count));
__SC_LONG(unsigned int, fd)
__typeof(__builtin_choose_expr(__TYPE_IS_LL(unsigned int), 0LL, 0L)) fd
__typeof(__builtin_choose_expr((__TYPE_AS(unsigned int, 0LL) || __TYPE_AS(unsigned int, 0ULL)), 0LL, 0L)) fd
__typeof(__builtin_choose_expr(false, 0LL, 0L)) fd
__typeof(0L) fd
long fd
static long __se_sys_write(long fd, long long buf, long count);
static inline long __do_sys_write(__MAP(3, __SC_DECL, unsigned int, fd, const char __user *, buf, size_t, count));
static inline long __do_sys_write(__SC_DECL(unsigned int, fd), __SC_DECL(const char __user *, buf), __SC_DECL(size_t, count));
static inline long __do_sys_write(unsigned int fd, const char __user * buf, size_t count);
__X64_SYS_STUBx(3, write, unsigned int, fd, const char __user *, buf, size_t, count)
__SYS_STUBx(x64, sys_write, SC_X86_64_REGS_TO_ARGS(3, unsigned int, fd, const char __user *, buf, size_t, count))
__SYS_STUBx(x64, sys_write, __MAP(3, __SC_ARGS, , regs->di, , regs->si, , regs->dx, , regs->r10, , regs->r8, , regs->r9))
__SYS_STUBx(x64, sys_write, regs->di, regs->si, regs->dx)
long __x64_sys_write(const struct pt_regs * regs);
long __x64_sys_write(const struct pt_regs * regs)
{
return __se_sys_write(regs->di, regs->si, regs->dx);
}
static long __se_sys_write(long fd, long long buf, long count)
{
long ret = __do_sys_write((__force unsigned int) fd, (__force const char __user *) buf, (__force size_t) count);
__MAP(x, __SC_TEST, __VA_ARGS__); // we simply ignore it because we assume the tests are passed
__PROTECT(x, ret, __MAP(x, __SC_ARGS, __VA_ARGS__)); // we simply ignore it because x86_64 doesn't use it
return ret;
}
static inline long __do_sys_write(unsigned int fd, const char __user * buf, size_t count) {
return ksys_write(fd, buf, count);
}
ssize_t ksys_write(unsigned int fd, const char __user * buf, size_t count) {
// ...
}
See Implementation Overview : Syscall Definition for the final expansion result of sys_write
.
# Add a New System Call
For beginners who only aim to make the added system call (e.g. sys_hello
) callable as quickly as possible, it is not necessary to follow the linux kernel documentation step-by-step (see References). Here we provide the minimal steps required to add a new system call.
# Use a New Source Directory (Optional)
As mentioned above, we need to implement the new system call in the form of SYSCALL_DEFINEn(name)
somewhere in the kernel source code. The easiest way is to arbitrarily implement it anywhere (e.g. in fs/read_write.c
together with syscall #0 sys_read
). However, using a new source directory is a more elegant option.
Assume that we create a new source directory named custom
. One of the easiest ways to make the kernel compile this directory is as follows.
Step 1. Modify line 663 of Makefile
(for linux-5.15.57) :
core-y := init/ usr/ arch/$(SRCARCH)/ custom/
Or add anywhere after line 663 :
core-y += custom/
Any other equivalent way works as well.
Step 2. Assign the list of object files to obj-y
in custom/Makefile
. For example :
obj-y := hello.o world.o
# Add Syscall Definition
Add the definition of syscall anywhere, for example :
/* Example syscall "hello": takes two ints and returns their sum. */
SYSCALL_DEFINE2(hello, int, x, int, y) {
return x + y;
}
# Config Syscall Table
Add a syscall entry in arch/x86/entry/syscalls/syscall_64.tbl
:
0 common read sys_read
1 common write sys_write
...
500 common hello sys_hello
...
# Add Syscall Function Prototype (Not Required by Default)
If the kernel config option CONFIG_ARCH_HAS_SYSCALL_WRAPPER
is disabled (different from default tinyconfig
we used), a corresponding function prototype should be added to include/linux/syscalls.h
:
#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
...
asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);
asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count);
...
/* must list both parameters of SYSCALL_DEFINE2(hello, int, x, int, y) */
asmlinkage long sys_hello(int x, int y);
...
#endif
# Execute the New Syscall
For brevity, we can simply write a hello-world program, compile it statically, and add it to the initramfs :
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#define SYS_hello 500
/*
 * Invoke the new syscall #500 with the arguments (12, 24) and print its result.
 * Returns 0 on success, 1 if the syscall itself failed.
 */
int main(void) {
printf("hello, world!\n");
/* syscall() returns long; keep the full width instead of truncating to int */
long ret = syscall(SYS_hello, 12, 24);
if (ret == -1) {
/* syscall() reports failure as -1 with errno set (e.g. ENOSYS if the table entry is missing) */
perror("syscall #500");
return 1;
}
printf("syscall #500 ret = %ld\n", ret);
fflush(stdout);
return 0;
}
The output of the above program is as follows :
hello, world!
syscall #500 ret = 36
# References
Linux Kernel Documentation - Adding a New System Call
linux-insides - System calls in the Linux kernel
# Appendix
# Difference to Older Kernel Version
In older versions of the linux kernel (e.g. 5.0.x), the syscall table is implemented in the form of designated initializers (e.g. [0] = sys_read, [1] = sys_write, ...). In 5.15.57, unused syscall entries are pre-filled with sys_ni_syscall
in the generated file syscalls_64.h
, then filled into the definition of sys_call_table
.
The following excerpts are from the older kernel. In include/generated/asm-offsets.h
:
#define __NR_syscall_max 547
In arch/x86/entry/syscall_64.c
:
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
[0 ... __NR_syscall_max] = &sys_ni_syscall,
[0] = sys_read,
[1] = sys_write,
...
};
In arch/x86/include/asm/syscall.h
:
typedef void (* sys_call_ptr_t)(void);