Kernel seminar – part 3

Top and bottom halves

Top half: the part of interrupt handling that executes within the ISR itself
Bottom half: the deferred part that executes after the ISR has returned
Bottom halves are commonly implemented as tasklets
Each tasklet is executed on one CPU after the interrupt has been handled. The tasklets waiting to be executed are stored in a linked list.

Declare a tasklet:

DECLARE_TASKLET (module_tasklet, //Name
    module_do_tasklet,          //Function
    tasklet_data);                       // tasklet argument

To schedule a tasklet (in the interrupt handler):

tasklet_schedule(&module_tasklet); // tasklets at the end of the linked list of tasklets
tasklet_hi_schedule(&module_tasklet); // start of the linked list of tasklets

tasklet example

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/interrupt.h>


char my_tasklet_data[]="my_tasklet_function was called";

/*
 * Bottom Half Function
 *
 * @data: the argument registered with the tasklet; it is interpreted as the
 *        address of a NUL-terminated string and printed to the kernel log.
 */
void my_tasklet_function( unsigned long data )
{
  printk( "%s\n", (char *)data );
}

/* Statically define the tasklet my_tasklet: my_tasklet_function is its
 * handler and the address of my_tasklet_data is the handler's argument. */
DECLARE_TASKLET( my_tasklet, my_tasklet_function,
                (unsigned long) &my_tasklet_data );

int init_module( void )
  /* Schedule the Bottom Half */
  tasklet_schedule( &my_tasklet );
  return 0;

void cleanup_module( void )
  /* Stop the tasklet before we exit */
  tasklet_kill( &my_tasklet );

Deferring work

Work queues

Another mechanism for deferring work
Works on the same CPU
Higher latency than tasklets

Work functions are in workqueue.h
Schedule work dynamically:

static inline bool schedule_work(struct work_struct *work)
static inline bool schedule_work_on(int cpu, struct work_struct *work)
static inline bool schedule_delayed_work(struct delayed_work *dwork,unsigned long delay)
static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,unsigned long delay)

Schedule work statically with macros

 INIT_WORK(_work, _func)
 INIT_DELAYED_WORK(_work, _func)

Work queue example:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/slab.h>


static struct workqueue_struct *my_wq;

/* A work item: the kernel work_struct plus an integer payload x. */
typedef struct {
  struct work_struct my_work;
  int x;
} my_work_t;
my_work_t *work, *work2;

static void my_wq_function( struct work_struct *work)
  my_work_t *my_work = (my_work_t *)work;
  printk( "my_work.x %d\n", my_work->x );
  kfree( (void *)work );

int init_module( void )
  int ret;
  my_wq = create_workqueue("my_queue");
  if (my_wq) {
    /* Queue some work (item 1) */
    work = (my_work_t *)kmalloc(sizeof(my_work_t), GFP_KERNEL);
    if (work) {
      INIT_WORK( (struct work_struct *)work, my_wq_function );
      work->x = 1;
      ret = queue_work( my_wq, (struct work_struct *)work );
    /* Queue some additional work (item 2) */
    work2 = (my_work_t *)kmalloc(sizeof(my_work_t), GFP_KERNEL);
    if (work2) {
      INIT_WORK( (struct work_struct *)work2, my_wq_function );
      work2->x = 2;
      ret = queue_work( my_wq, (struct work_struct *)work2 );
  return 0;

void cleanup_module( void )
  flush_workqueue( my_wq );
  destroy_workqueue( my_wq );

Time in the kernel


Timer interrupts occur every 1/HZ of a second (= 1 jiffy)
HZ is configurable (in ‘Processor type and features’):
* 100, 250 (i386 default), 300 or 1000 (other architectures)
* see kernel/Kconfig.hz
The global variable jiffies holds the number of ticks since the machine started. It is incremented on each timer interrupt
Read jiffies with get_jiffies_64 function
Convert to msec with jiffies_to_msecs or to microsecs with jiffies_to_usecs
Requires #include <linux/jiffies.h>

The kernel provides asynchronous timers
Timer callbacks run in atomic context (they must not sleep)
require #include <linux/timer.h>
Timers API
static: TIMER_INITIALIZER(_function,_expires,_data)
void init_timer(struct timer_list *timer)
void setup_timer( struct timer_list *timer,void (*function)(unsigned long),unsigned long data)
add new timer:
void add_timer (struct timer_list *timer)
remove timer:
void del_timer (struct timer_list *timer)

Timer simple program example

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/timer.h>


static struct timer_list my_timer;

void my_timer_callback( unsigned long data )
  printk( "my_timer_callback called (%ld).\n", jiffies );

int init_module( void )
  int ret;

  printk("Timer module installing\n");

  // my_timer.function,
  setup_timer( &my_timer, my_timer_callback, 0 );

  printk( "Starting timer to fire in 200ms (%ld)\n", jiffies );
  ret = mod_timer( &my_timer, jiffies + msecs_to_jiffies(200) );
  if (ret) printk("Error in mod_timer\n");

  return 0;

void cleanup_module( void )
  int ret;

  ret = del_timer( &my_timer );
  if (ret) printk("The timer is still in use...\n");

  printk("Timer module uninstalling\n");


High resolution timers

Allow timers in resolution of nano seconds
High resolution timers can use one of two time bases (as opposed to jiffies for normal timers):
1. CLOCK_REALTIME: wall-clock time; it can be changed when the system time is set
2. CLOCK_MONOTONIC: a clock measuring the time in seconds and nanoseconds since system boot. It cannot be modified, so it can be used for accurate time measurement.

High resolution timers example

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>


/* Convert milliseconds to nanoseconds. Integer arithmetic only: kernel code
 * should not use floating point. The argument is parenthesised so that
 * expressions such as MS_TO_NS(a + b) expand correctly. */
#define MS_TO_NS(x)     ((x) * 1000000UL)

static struct hrtimer hr_timer;

enum hrtimer_restart my_hrtimer_callback( struct hrtimer *timer )
  ktime_t now;
  ktime_t delay = ktime_set(0, MS_TO_NS(200));
  printk( "my_hrtimer_callback called (%ld).\n", jiffies );
  now = ktime_get();
  hrtimer_forward(&hr_timer, now, delay);


int init_module( void )
  ktime_t ktime;
  unsigned long delay_in_ms = 200L;

  printk("HR Timer module installing\n");

  ktime = ktime_set( 0, MS_TO_NS(delay_in_ms) );

  hrtimer_init( &hr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL );

  hr_timer.function = &my_hrtimer_callback;

  printk( "Starting timer to fire in %ldms (%ld)\n", delay_in_ms, jiffies );

  hrtimer_start( &hr_timer, ktime, HRTIMER_MODE_REL );

  return 0;

void cleanup_module( void )
  int ret;

  ret = hrtimer_cancel( &hr_timer );
  if (ret) printk("The timer was still in use...\n");

  printk("HR Timer module uninstalling\n");


Kernel seminar – part 2

Memory allocation

There is no “malloc” here, this is not a user space. Here we use kmalloc.

void * kmalloc (size_t size, gfp_t flags);

kmalloc flags:
GFP_KERNEL – standard memory allocation. May block
GFP_ATOMIC – allocated RAM from interrupt handlers or code not triggered by user processes. Never blocks. Atomic memory pool is limited.

The memory that kmalloc allocates, is contiguous RAM memory.

Its partner is kfree – releases the memory back

According to the seminar trainer, in kernel, it is recommended to allocate memory dynamically, because there is a memory limitation to kernel memory

Other memory allocation functions related to kmalloc:
kzalloc, kcalloc, krealloc

vmalloc – use where no need to allocate physically contiguous memory.

Its partner is vfree – releases the memory

All memory allocation functions can use memset, memcpy functions and others

The memory allocation functions will always allocate memory in size of pages. if page is 512 KB, and I need 400 KB, the v/malloc will allocate 512 KB
This is where other allocation method come in handy:
SLAB Allocator – memory manager that manages small, fixed size allocations
Mainly used by linux core subsystems : file systems, networking etc.
Live stats in /proc/slabinfo, or command slabtop

Active / Total Objects (% used)    : 293652 / 296111 (99.2%)
Active / Total Slabs (% used)      : 5214 / 5214 (100.0%)
Active / Total Caches (% used)     : 72 / 103 (69.9%)
Active / Total Size (% used)       : 69509.67K / 70800.68K (98.2%)
Minimum / Average / Maximum Object : 0.01K / 0.24K / 8.00K

46512  46512 100%    0.12K    684       68      5472K kernfs_node_cache
36792  36792 100%    0.19K    876       42      7008K dentry
25704  24525  95%    0.04K    252      102      1008K selinux_inode_security
24896  24896 100%    0.06K    389       64      1556K kmalloc-64
17875  17431  97%    0.58K    325       55     10400K inode_cache
17834  17834 100%    0.21K    482       37      3856K vm_area_struct
16640  16640 100%    0.02K     65      256       260K kmalloc-16
13184  13184 100%    0.25K    206       64      3296K kmalloc-256
 9996   9996 100%    0.19K    238       42      1904K kmalloc-192

Function to create SLAB:

struct kmem_cache * kmem_cache_create (	const char *  	name,
 	size_t  	size,
 	size_t  	align,
 	unsigned long  	flags,
 	void (*ctor)(void *, struct kmem_cache *, unsigned long),
 	void (*dtor)(void *, struct kmem_cache *, unsigned long));


SLAB_NO_REAP – Protects the cache from being reduced.
SLAB_HWCACHE_ALIGN – each data object shall be aligned to a cache line.
SLAB_CACHE_DMA – Each data object should be allocated in ZONE_DMA

Releases the slab cache

void kmem_cache_destroy (struct kmem_cache * cachep);

SLOB – Simple List Of Blocks. Same as SLAB, but more efficient, but based on heap memory management. According to the trainer, rarely used.

SLUB – Simple List of Unqueued Blocks. Same as SLAB/SLOB. Better for multi-CPU machines. Scales better, but slower than SLAB.
SLUB is the default allocator !

I/O Memory & Ports

Linux still has ports like they used to be on the old Intel’s 8086 processor. These are virtual ports, but still accessible.

less /proc/ioports
0000-0cf7 : PCI Bus 0000:00
  0000-001f : dma1
  0020-0021 : PNP0001:00
    0020-0021 : pic1
  0040-0043 : timer0
  0050-0053 : timer1
  0060-0060 : keyboard
  0061-0061 : PNP0800:00
  0064-0064 : keyboard
  0070-0071 : rtc0
  0080-008f : dma page reg
  00a0-00a1 : PNP0001:00
    00a0-00a1 : pic2
  00c0-00df : dma2
  00f0-00ff : fpu
  0170-0177 : 0000:00:07.1
    0170-0177 : ata_piix
  01f0-01f7 : 0000:00:07.1
    01f0-01f7 : ata_piix
  0376-0376 : 0000:00:07.1
    0376-0376 : ata_piix
  03c0-03df : vga+
  03f6-03f6 : 0000:00:07.1
    03f6-03f6 : ata_piix
  03f8-03ff : serial
  04d0-04d1 : PNP0001:00
  0cf0-0cf1 : pnp 00:00
0cf8-0cff : PCI conf1
0d00-feff : PCI Bus 0000:00
  1000-103f : 0000:00:07.3
    1000-1003 : ACPI PM1a_EVT_BLK
    1004-1005 : ACPI PM1a_CNT_BLK
    1008-100b : ACPI PM_TMR
    100c-100f : ACPI GPE0_BLK
    1010-1015 : ACPI CPU throttle
  1040-104f : 0000:00:07.3
    1040-104f : pnp 00:00

Ports can be accessed with in / out commands:
unsigned inb (unsigned port);
void outb(unsigned char byte, unsigned port);

More modern method is to access I/O Memory

less /proc/iomem
00000000-00000fff : reserved
00001000-0009ebff : System RAM
0009ec00-0009ffff : reserved
000a0000-000bffff : PCI Bus 0000:00
000c0000-000c7fff : Video ROM
000ca000-000cafff : Adapter ROM
000cc000-000cffff : PCI Bus 0000:00
000d0000-000d3fff : PCI Bus 0000:00
000d4000-000d7fff : PCI Bus 0000:00
000d8000-000dbfff : PCI Bus 0000:00
000dc000-000fffff : reserved
  000f0000-000fffff : System ROM
00100000-bfecffff : System RAM
  2a000000-340fffff : Crash kernel
bfed0000-bfefefff : ACPI Tables
bfeff000-bfefffff : ACPI Non-volatile Storage
bff00000-bfffffff : System RAM
c0000000-febfffff : PCI Bus 0000:00
  c0000000-c0007fff : 0000:00:0f.0
  c0008000-c000bfff : 0000:00:10.0
  e5b00000-e5bfffff : PCI Bus 0000:22
  e5c00000-e5cfffff : PCI Bus 0000:1a
  e5d00000-e5dfffff : PCI Bus 0000:12
  e5e00000-e5efffff : PCI Bus 0000:0a
  e5f00000-e5ffffff : PCI Bus 0000:21
  e6000000-e60fffff : PCI Bus 0000:19
  e6100000-e61fffff : PCI Bus 0000:11
  e6200000-e62fffff : PCI Bus 0000:09
  e6300000-e63fffff : PCI Bus 0000:20
  e6400000-e64fffff : PCI Bus 0000:18
  e6500000-e65fffff : PCI Bus 0000:10
  e6600000-e66fffff : PCI Bus 0000:08
  e6700000-e67fffff : PCI Bus 0000:1f
  e6800000-e68fffff : PCI Bus 0000:17
  e6900000-e69fffff : PCI Bus 0000:0f
  e6a00000-e6afffff : PCI Bus 0000:07
  e6b00000-e6bfffff : PCI Bus 0000:1e

To use I/O memory in a driver, first request the memory region, and release it when done. Functions: request_mem_region, release_mem_region.

Physical devices sit at physical memory addresses, but the kernel works with virtual memory. There are functions that map physical memory to virtual memory: ioremap returns a virtual address for a physical region, and iounmap releases that virtual address mapping.

To read and write from and to the mapped memory use functions ioread8/16/32 and iowrite8/16/32. This is the portable and recommended way.
Also functions that do read / write repeatedly: ioread8_rep, iowrite8_rep
Additional functions for this: memset_io, memcpy_fromio, memcpy_toio.

Helpful functions from #include <asm/io.h>:
virt_to_phys – kernel virtual address to physical address
phys_to_virt – physical address to kernel virtual address
For platforms with IOMMU:
virt_to_bus / bus_to_virt – Translates DMA-bus addresses to virtual addresses and back

Sleeping in Kernel

Needed when process waiting for data.

To send process to sleep:
1. Declare a wait queue statically
wait_queue_head_t queue;

2. Make kernel process sleep: wait_event(queue, condition)
wait_event_interruptible(queue, condition); – returns either because the condition became true after a wake-up, or because a signal interrupted the sleep. The return value must therefore be checked to find out why the process continued (non-zero means it was interrupted by a signal).

Waking up – wake_up(&queue) will wake all waiting processes in the queue

Interrupt handling

Interrupt handlers are chained. On every IRQ line can be several devices.
The kernel walks over all members of the linked list, the chain (orange box then yellow box). Each handler needs to check whether its device is responsible for the interrupt. If the device did not raise the interrupt, the ISR handler returns a value saying so, and the kernel continues to the next ISR handler. The chain stops when the device that raised the interrupt is found.
If needed ISR handler can be removed from the chain.

The command cat /proc/interrupts shows the interrupt counts for every processor:

   0:         58   IO-APIC-edge      timer
   1:         10   IO-APIC-edge      i8042
   8:          1   IO-APIC-edge      rtc0
   9:          0   IO-APIC-fasteoi   acpi
  12:         16   IO-APIC-edge      i8042
  14:          0   IO-APIC-edge      ata_piix
  15:       6041   IO-APIC-edge      ata_piix
  16:        653   IO-APIC-fasteoi   vmwgfx, snd_ens1371
  17:       8702   IO-APIC-fasteoi   ehci_hcd:usb1, ioc0
  18:         73   IO-APIC-fasteoi   uhci_hcd:usb2
  19:      35845   IO-APIC-fasteoi   ens33
  24:          0   PCI-MSI-edge      PCIe PME, pciehp
  25:          0   PCI-MSI-edge      PCIe PME, pciehp
  26:          0   PCI-MSI-edge      PCIe PME, pciehp
  27:          0   PCI-MSI-edge      PCIe PME, pciehp
  28:          0   PCI-MSI-edge      PCIe PME, pciehp
  29:          0   PCI-MSI-edge      PCIe PME, pciehp
  30:          0   PCI-MSI-edge      PCIe PME, pciehp
  31:          0   PCI-MSI-edge      PCIe PME, pciehp
  32:          0   PCI-MSI-edge      PCIe PME, pciehp
  33:          0   PCI-MSI-edge      PCIe PME, pciehp
  34:          0   PCI-MSI-edge      PCIe PME, pciehp
  35:          0   PCI-MSI-edge      PCIe PME, pciehp
  36:          0   PCI-MSI-edge      PCIe PME, pciehp
  37:          0   PCI-MSI-edge      PCIe PME, pciehp
  38:          0   PCI-MSI-edge      PCIe PME, pciehp
  39:          0   PCI-MSI-edge      PCIe PME, pciehp
  40:          0   PCI-MSI-edge      PCIe PME, pciehp
  41:          0   PCI-MSI-edge      PCIe PME, pciehp
  42:          0   PCI-MSI-edge      PCIe PME, pciehp
  43:          0   PCI-MSI-edge      PCIe PME, pciehp
  44:          0   PCI-MSI-edge      PCIe PME, pciehp
  45:          0   PCI-MSI-edge      PCIe PME, pciehp
  46:          0   PCI-MSI-edge      PCIe PME, pciehp
  47:          0   PCI-MSI-edge      PCIe PME, pciehp
  48:          0   PCI-MSI-edge      PCIe PME, pciehp
  49:          0   PCI-MSI-edge      PCIe PME, pciehp
  50:          0   PCI-MSI-edge      PCIe PME, pciehp
  51:          0   PCI-MSI-edge      PCIe PME, pciehp
  52:          0   PCI-MSI-edge      PCIe PME, pciehp
  53:          0   PCI-MSI-edge      PCIe PME, pciehp
  54:          0   PCI-MSI-edge      PCIe PME, pciehp
  55:          0   PCI-MSI-edge      PCIe PME, pciehp
  56:        773   PCI-MSI-edge      vmw_vmci
  57:          0   PCI-MSI-edge      vmw_vmci
 NMI:          0   Non-maskable interrupts
 LOC:     294243   Local timer interrupts
 SPU:          0   Spurious interrupts
 PMI:          0   Performance monitoring interrupts
 IWI:      31267   IRQ work interrupts
 RTR:          0   APIC ICR read retries
 RES:          0   Rescheduling interrupts
 CAL:          0   Function call interrupts
 TLB:          0   TLB shootdowns
 TRM:          0   Thermal event interrupts
 THR:          0   Threshold APIC interrupts
 DFR:          0   Deferred Error APIC interrupts
 MCE:          0   Machine check exceptions
 MCP:         21   Machine check polls
 ERR:          0
 MIS:          0
 PIN:          0   Posted-interrupt notification event
 NPI:          0   Nested posted-interrupt event
 PIW:          0   Posted-interrupt wakeup event

To register an interrupt into the chain use function request_irq from include/linux/interrupt.h

int request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
	    const char *name, void *dev);

Interrupt handler tasks:
1. Clear interrupt flag from the device
2. Read/Write from/to the device
3. Wake UP processes waiting for this operation

Interrupt handler prototype
irqreturn_t handler(int irq,
void *dev_id);
Return value:
IRQ_HANDLED – recognized and handled interrupt
IRQ_NONE – not handled
IRQ_WAKE_THREAD – wake the handler thread (for threaded interrupts)

Disabling interrupt on local CPU:
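unsigned long flags;
local_irq_save(flags);
/* ... code that runs with local interrupts disabled ... */
local_irq_restore(flags);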

Must be run from the same function

Threaded interrupt

The OS wants to minimize the time it spends handling interrupts. Threaded interrupts are a method for this.
While in the ISR, if a routine in the chain is found to handle the interrupt, then instead of handling the interrupt there, the work is handed to a kernel thread and scheduled like any other code. As a result, the interrupt is handled according to its priority.

Register interrupt using the function request_threaded_irq

int request_threaded_irq (	unsigned int irq,
 	irq_handler_t handler,
 	irq_handler_t thread_fn,
 	unsigned long irqflags,
 	const char * devname,
 	void * dev_id);