]> Pileus Git - ~andy/linux/blob - kernel/power/swsusp.c
[PATCH] swsusp: improve handling of swap partitions
[~andy/linux] / kernel / power / swsusp.c
1 /*
2  * linux/kernel/power/swsusp.c
3  *
4  * This file provides code to write suspend image to swap and read it back.
5  *
6  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7  * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8  *
9  * This file is released under the GPLv2.
10  *
11  * I'd like to thank the following people for their work:
12  *
13  * Pavel Machek <pavel@ucw.cz>:
14  * Modifications, defectiveness pointing, being with me at the very beginning,
15  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16  *
17  * Steve Doddi <dirk@loth.demon.co.uk>:
18  * Support the possibility of hardware state restoring.
19  *
20  * Raph <grey.havens@earthling.net>:
21  * Support for preserving states of network devices and virtual console
22  * (including X and svgatextmode)
23  *
24  * Kurt Garloff <garloff@suse.de>:
25  * Straightened the critical function in order to prevent compilers from
26  * playing tricks with local variables.
27  *
28  * Andreas Mohr <a.mohr@mailto.de>
29  *
30  * Alex Badea <vampire@go.ro>:
31  * Fixed runaway init
32  *
33  * Rafael J. Wysocki <rjw@sisk.pl>
34  * Added the swap map data structure and reworked the handling of swap
35  *
36  * More state savers are welcome. Especially for the scsi layer...
37  *
38  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39  */
40
41 #include <linux/module.h>
42 #include <linux/mm.h>
43 #include <linux/suspend.h>
44 #include <linux/smp_lock.h>
45 #include <linux/file.h>
46 #include <linux/utsname.h>
47 #include <linux/version.h>
48 #include <linux/delay.h>
49 #include <linux/bitops.h>
50 #include <linux/spinlock.h>
51 #include <linux/genhd.h>
52 #include <linux/kernel.h>
53 #include <linux/major.h>
54 #include <linux/swap.h>
55 #include <linux/pm.h>
56 #include <linux/device.h>
57 #include <linux/buffer_head.h>
58 #include <linux/swapops.h>
59 #include <linux/bootmem.h>
60 #include <linux/syscalls.h>
61 #include <linux/highmem.h>
62 #include <linux/bio.h>
63
64 #include <asm/uaccess.h>
65 #include <asm/mmu_context.h>
66 #include <asm/pgtable.h>
67 #include <asm/tlbflush.h>
68 #include <asm/io.h>
69
70 #include "power.h"
71
/*
 * Preferred size of the suspend image in megabytes, adjustable through
 * /sys/power/image_size.  With the value N, swsusp tries to keep the image
 * at or below N MB; when that cannot be achieved it produces the smallest
 * image it can instead.
 */
unsigned int image_size = 500;
79
/*
 * Highmem helpers.  With CONFIG_HIGHMEM they are only declared here and
 * must be defined elsewhere; without it, local stubs are used that report
 * zero highmem pages and always succeed.
 */
#ifndef CONFIG_HIGHMEM
static unsigned int count_highmem_pages(void) { return 0; }
static int save_highmem(void) { return 0; }
static int restore_highmem(void) { return 0; }
#else
unsigned int count_highmem_pages(void);
int save_highmem(void);
int restore_highmem(void);
#endif
89
90 extern char resume_file[];
91
92 #define SWSUSP_SIG      "S1SUSPEND"
93
94 static struct swsusp_header {
95         char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
96         swp_entry_t swsusp_info;
97         char    orig_sig[10];
98         char    sig[10];
99 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
100
101 static struct swsusp_info swsusp_info;
102
103 /*
104  * Saving part...
105  */
106
107 static unsigned short root_swap = 0xffff;
108
109 static int mark_swapfiles(swp_entry_t prev)
110 {
111         int error;
112
113         rw_swap_page_sync(READ,
114                           swp_entry(root_swap, 0),
115                           virt_to_page((unsigned long)&swsusp_header));
116         if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
117             !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
118                 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
119                 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
120                 swsusp_header.swsusp_info = prev;
121                 error = rw_swap_page_sync(WRITE,
122                                           swp_entry(root_swap, 0),
123                                           virt_to_page((unsigned long)
124                                                        &swsusp_header));
125         } else {
126                 pr_debug("swsusp: Partition is not swap space.\n");
127                 error = -ENODEV;
128         }
129         return error;
130 }
131
132 /*
133  * Check whether the swap device is the specified resume
134  * device, irrespective of whether they are specified by
135  * identical names.
136  *
137  * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
138  * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
139  * and they'll be considered the same device.  This is *necessary* for
140  * devfs, since the resume code can only recognize the form /dev/hda4,
141  * but the suspend code would see the long name.)
142  */
143 static inline int is_resume_device(const struct swap_info_struct *swap_info)
144 {
145         struct file *file = swap_info->swap_file;
146         struct inode *inode = file->f_dentry->d_inode;
147
148         return S_ISBLK(inode->i_mode) &&
149                 swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
150 }
151
152 static int swsusp_swap_check(void) /* This is called before saving image */
153 {
154         int i;
155
156         if (!swsusp_resume_device)
157                 return -ENODEV;
158         spin_lock(&swap_lock);
159         for (i = 0; i < MAX_SWAPFILES; i++) {
160                 if (!(swap_info[i].flags & SWP_WRITEOK))
161                         continue;
162                 if (is_resume_device(swap_info + i)) {
163                         spin_unlock(&swap_lock);
164                         root_swap = i;
165                         return 0;
166                 }
167         }
168         spin_unlock(&swap_lock);
169         return -ENODEV;
170 }
171
172 /**
173  *      write_page - Write one page to a fresh swap location.
174  *      @addr:  Address we're writing.
175  *      @loc:   Place to store the entry we used.
176  *
177  *      Allocate a new swap entry and 'sync' it. Note we discard -EIO
178  *      errors. That is an artifact left over from swsusp. It did not
179  *      check the return of rw_swap_page_sync() at all, since most pages
180  *      written back to swap would return -EIO.
181  *      This is a partial improvement, since we will at least return other
182  *      errors, though we need to eventually fix the damn code.
183  */
184 static int write_page(unsigned long addr, swp_entry_t *loc)
185 {
186         swp_entry_t entry;
187         int error = -ENOSPC;
188
189         entry = get_swap_page_of_type(root_swap);
190         if (swp_offset(entry)) {
191                 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
192                 if (!error || error == -EIO)
193                         *loc = entry;
194         }
195         return error;
196 }
197
198 /**
199  *      Swap map-handling functions
200  *
201  *      The swap map is a data structure used for keeping track of each page
202  *      written to the swap.  It consists of many swap_map_page structures
203  *      that contain each an array of MAP_PAGE_SIZE swap entries.
204  *      These structures are linked together with the help of either the
205  *      .next (in memory) or the .next_swap (in swap) member.
206  *
207  *      The swap map is created during suspend.  At that time we need to keep
208  *      it in memory, because we have to free all of the allocated swap
209  *      entries if an error occurs.  The memory needed is preallocated
210  *      so that we know in advance if there's enough of it.
211  *
212  *      The first swap_map_page structure is filled with the swap entries that
213  *      correspond to the first MAP_PAGE_SIZE data pages written to swap and
214  *      so on.  After all of the data pages have been written, the order
215  *      of the swap_map_page structures in the map is reversed so that they
216  *      can be read from swap in the original order.  This causes the data
217  *      pages to be loaded in exactly the same order in which they have been
218  *      saved.
219  *
220  *      During resume we only need to use one swap_map_page structure
221  *      at a time, which means that we only need to use two memory pages for
222  *      reading the image - one for reading the swap_map_page structures
223  *      and the second for reading the data pages from swap.
224  */
225
226 #define MAP_PAGE_SIZE   ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
227                         / sizeof(swp_entry_t))
228
229 struct swap_map_page {
230         swp_entry_t             entries[MAP_PAGE_SIZE];
231         swp_entry_t             next_swap;
232         struct swap_map_page    *next;
233 };
234
235 static inline void free_swap_map(struct swap_map_page *swap_map)
236 {
237         struct swap_map_page *swp;
238
239         while (swap_map) {
240                 swp = swap_map->next;
241                 free_page((unsigned long)swap_map);
242                 swap_map = swp;
243         }
244 }
245
246 static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
247 {
248         struct swap_map_page *swap_map, *swp;
249         unsigned n = 0;
250
251         if (!nr_pages)
252                 return NULL;
253
254         pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
255         swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
256         swp = swap_map;
257         for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
258                 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
259                 swp = swp->next;
260                 if (!swp) {
261                         free_swap_map(swap_map);
262                         return NULL;
263                 }
264         }
265         return swap_map;
266 }
267
268 /**
269  *      reverse_swap_map - reverse the order of pages in the swap map
270  *      @swap_map
271  */
272
273 static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
274 {
275         struct swap_map_page *prev, *next;
276
277         prev = NULL;
278         while (swap_map) {
279                 next = swap_map->next;
280                 swap_map->next = prev;
281                 prev = swap_map;
282                 swap_map = next;
283         }
284         return prev;
285 }
286
287 /**
288  *      free_swap_map_entries - free the swap entries allocated to store
289  *      the swap map @swap_map (this is only called in case of an error)
290  */
291 static inline void free_swap_map_entries(struct swap_map_page *swap_map)
292 {
293         while (swap_map) {
294                 if (swap_map->next_swap.val)
295                         swap_free(swap_map->next_swap);
296                 swap_map = swap_map->next;
297         }
298 }
299
300 /**
301  *      save_swap_map - save the swap map used for tracing the data pages
302  *      stored in the swap
303  */
304
305 static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
306 {
307         swp_entry_t entry = (swp_entry_t){0};
308         int error;
309
310         while (swap_map) {
311                 swap_map->next_swap = entry;
312                 if ((error = write_page((unsigned long)swap_map, &entry)))
313                         return error;
314                 swap_map = swap_map->next;
315         }
316         *start = entry;
317         return 0;
318 }
319
320 /**
321  *      free_image_entries - free the swap entries allocated to store
322  *      the image data pages (this is only called in case of an error)
323  */
324
325 static inline void free_image_entries(struct swap_map_page *swp)
326 {
327         unsigned k;
328
329         while (swp) {
330                 for (k = 0; k < MAP_PAGE_SIZE; k++)
331                         if (swp->entries[k].val)
332                                 swap_free(swp->entries[k]);
333                 swp = swp->next;
334         }
335 }
336
/**
 *	The swap_map_handle structure is a cursor over a swap map; it lets
 *	the map be written and read sequentially, like a file.
 */

struct swap_map_handle {
	struct swap_map_page *cur;	/* map page currently in use */
	unsigned int k;			/* index of the next slot in cur->entries */
};
346
347 static inline void init_swap_map_handle(struct swap_map_handle *handle,
348                                         struct swap_map_page *map)
349 {
350         handle->cur = map;
351         handle->k = 0;
352 }
353
354 static inline int swap_map_write_page(struct swap_map_handle *handle,
355                                       unsigned long addr)
356 {
357         int error;
358
359         error = write_page(addr, handle->cur->entries + handle->k);
360         if (error)
361                 return error;
362         if (++handle->k >= MAP_PAGE_SIZE) {
363                 handle->cur = handle->cur->next;
364                 handle->k = 0;
365         }
366         return 0;
367 }
368
369 /**
370  *      save_image_data - save the data pages pointed to by the PBEs
371  *      from the list @pblist using the swap map handle @handle
372  *      (assume there are @nr_pages data pages to save)
373  */
374
375 static int save_image_data(struct pbe *pblist,
376                            struct swap_map_handle *handle,
377                            unsigned int nr_pages)
378 {
379         unsigned int m;
380         struct pbe *p;
381         int error = 0;
382
383         printk("Saving image data pages (%u pages) ...     ", nr_pages);
384         m = nr_pages / 100;
385         if (!m)
386                 m = 1;
387         nr_pages = 0;
388         for_each_pbe (p, pblist) {
389                 error = swap_map_write_page(handle, p->address);
390                 if (error)
391                         break;
392                 if (!(nr_pages % m))
393                         printk("\b\b\b\b%3d%%", nr_pages / m);
394                 nr_pages++;
395         }
396         if (!error)
397                 printk("\b\b\b\bdone\n");
398         return error;
399 }
400
401 static void dump_info(void)
402 {
403         pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
404         pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
405         pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
406         pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
407         pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
408         pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
409         pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
410         pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
411         pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
412         pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
413         pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
414 }
415
416 static void init_header(unsigned int nr_pages)
417 {
418         memset(&swsusp_info, 0, sizeof(swsusp_info));
419         swsusp_info.version_code = LINUX_VERSION_CODE;
420         swsusp_info.num_physpages = num_physpages;
421         memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
422
423         swsusp_info.cpus = num_online_cpus();
424         swsusp_info.image_pages = nr_pages;
425         swsusp_info.pages = nr_pages +
426                 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT);
427 }
428
429 static int close_swap(void)
430 {
431         swp_entry_t entry;
432         int error;
433
434         dump_info();
435         error = write_page((unsigned long)&swsusp_info, &entry);
436         if (!error) {
437                 printk( "S" );
438                 error = mark_swapfiles(entry);
439                 printk( "|\n" );
440         }
441         return error;
442 }
443
444 /**
445  *      pack_orig_addresses - the .orig_address fields of the PBEs from the
446  *      list starting at @pbe are stored in the array @buf[] (1 page)
447  */
448
449 static inline struct pbe *pack_orig_addresses(unsigned long *buf,
450                                               struct pbe *pbe)
451 {
452         int j;
453
454         for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
455                 buf[j] = pbe->orig_address;
456                 pbe = pbe->next;
457         }
458         if (!pbe)
459                 for (; j < PAGE_SIZE / sizeof(long); j++)
460                         buf[j] = 0;
461         return pbe;
462 }
463
464 /**
465  *      save_image_metadata - save the .orig_address fields of the PBEs
466  *      from the list @pblist using the swap map handle @handle
467  */
468
469 static int save_image_metadata(struct pbe *pblist,
470                                struct swap_map_handle *handle)
471 {
472         unsigned long *buf;
473         unsigned int n = 0;
474         struct pbe *p;
475         int error = 0;
476
477         printk("Saving image metadata ... ");
478         buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
479         if (!buf)
480                 return -ENOMEM;
481         p = pblist;
482         while (p) {
483                 p = pack_orig_addresses(buf, p);
484                 error = swap_map_write_page(handle, (unsigned long)buf);
485                 if (error)
486                         break;
487                 n++;
488         }
489         free_page((unsigned long)buf);
490         if (!error)
491                 printk("done (%u pages saved)\n", n);
492         return error;
493 }
494
495 /**
496  *      enough_swap - Make sure we have enough swap to save the image.
497  *
498  *      Returns TRUE or FALSE after checking the total amount of swap
499  *      space avaiable from the resume partition.
500  */
501
502 static int enough_swap(unsigned int nr_pages)
503 {
504         unsigned int free_swap = swap_info[root_swap].pages -
505                 swap_info[root_swap].inuse_pages;
506
507         pr_debug("swsusp: free swap pages: %u\n", free_swap);
508         return free_swap > (nr_pages + PAGES_FOR_IO +
509                 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
510 }
511
512 /**
513  *      swsusp_write - Write entire image and metadata.
514  *
515  *      It is important _NOT_ to umount filesystems at this point. We want
516  *      them synced (in case something goes wrong) but we DO not want to mark
517  *      filesystem clean: it is not. (And it does not matter, if we resume
518  *      correctly, we'll mark system clean, anyway.)
519  */
520
521 int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
522 {
523         struct swap_map_page *swap_map;
524         struct swap_map_handle handle;
525         int error;
526
527         if ((error = swsusp_swap_check())) {
528                 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
529                 return error;
530         }
531         if (!enough_swap(nr_pages)) {
532                 printk(KERN_ERR "swsusp: Not enough free swap\n");
533                 return -ENOSPC;
534         }
535
536         init_header(nr_pages);
537         swap_map = alloc_swap_map(swsusp_info.pages);
538         if (!swap_map)
539                 return -ENOMEM;
540         init_swap_map_handle(&handle, swap_map);
541
542         error = save_image_metadata(pblist, &handle);
543         if (!error)
544                 error = save_image_data(pblist, &handle, nr_pages);
545         if (error)
546                 goto Free_image_entries;
547
548         swap_map = reverse_swap_map(swap_map);
549         error = save_swap_map(swap_map, &swsusp_info.start);
550         if (error)
551                 goto Free_map_entries;
552
553         error = close_swap();
554         if (error)
555                 goto Free_map_entries;
556
557 Free_swap_map:
558         free_swap_map(swap_map);
559         return error;
560
561 Free_map_entries:
562         free_swap_map_entries(swap_map);
563 Free_image_entries:
564         free_image_entries(swap_map);
565         goto Free_swap_map;
566 }
567
568 /**
569  *      swsusp_shrink_memory -  Try to free as much memory as needed
570  *
571  *      ... but do not OOM-kill anyone
572  *
573  *      Notice: all userland should be stopped before it is called, or
574  *      livelock is possible.
575  */
576
577 #define SHRINK_BITE     10000
578
579 int swsusp_shrink_memory(void)
580 {
581         long size, tmp;
582         struct zone *zone;
583         unsigned long pages = 0;
584         unsigned int i = 0;
585         char *p = "-\\|/";
586
587         printk("Shrinking memory...  ");
588         do {
589                 size = 2 * count_highmem_pages();
590                 size += size / 50 + count_data_pages();
591                 size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
592                         PAGES_FOR_IO;
593                 tmp = size;
594                 for_each_zone (zone)
595                         if (!is_highmem(zone))
596                                 tmp -= zone->free_pages;
597                 if (tmp > 0) {
598                         tmp = shrink_all_memory(SHRINK_BITE);
599                         if (!tmp)
600                                 return -ENOMEM;
601                         pages += tmp;
602                 } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) {
603                         tmp = shrink_all_memory(SHRINK_BITE);
604                         pages += tmp;
605                 }
606                 printk("\b%c", p[i++%4]);
607         } while (tmp > 0);
608         printk("\bdone (%lu pages freed)\n", pages);
609
610         return 0;
611 }
612
613 int swsusp_suspend(void)
614 {
615         int error;
616
617         if ((error = arch_prepare_suspend()))
618                 return error;
619         local_irq_disable();
620         /* At this point, device_suspend() has been called, but *not*
621          * device_power_down(). We *must* device_power_down() now.
622          * Otherwise, drivers for some devices (e.g. interrupt controllers)
623          * become desynchronized with the actual state of the hardware
624          * at resume time, and evil weirdness ensues.
625          */
626         if ((error = device_power_down(PMSG_FREEZE))) {
627                 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
628                 goto Enable_irqs;
629         }
630
631         if ((error = save_highmem())) {
632                 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
633                 goto Restore_highmem;
634         }
635
636         save_processor_state();
637         if ((error = swsusp_arch_suspend()))
638                 printk(KERN_ERR "Error %d suspending\n", error);
639         /* Restore control flow magically appears here */
640         restore_processor_state();
641 Restore_highmem:
642         restore_highmem();
643         device_power_up();
644 Enable_irqs:
645         local_irq_enable();
646         return error;
647 }
648
649 int swsusp_resume(void)
650 {
651         int error;
652         local_irq_disable();
653         if (device_power_down(PMSG_FREEZE))
654                 printk(KERN_ERR "Some devices failed to power down, very bad\n");
655         /* We'll ignore saved state, but this gets preempt count (etc) right */
656         save_processor_state();
657         error = swsusp_arch_resume();
658         /* Code below is only ever reached in case of failure. Otherwise
659          * execution continues at place where swsusp_arch_suspend was called
660          */
661         BUG_ON(!error);
662         /* The only reason why swsusp_arch_resume() can fail is memory being
663          * very tight, so we have to free it as soon as we can to avoid
664          * subsequent failures
665          */
666         swsusp_free();
667         restore_processor_state();
668         restore_highmem();
669         touch_softlockup_watchdog();
670         device_power_up();
671         local_irq_enable();
672         return error;
673 }
674
675 /**
676  *      mark_unsafe_pages - mark the pages that cannot be used for storing
677  *      the image during resume, because they conflict with the pages that
678  *      had been used before suspend
679  */
680
681 static void mark_unsafe_pages(struct pbe *pblist)
682 {
683         struct zone *zone;
684         unsigned long zone_pfn;
685         struct pbe *p;
686
687         if (!pblist) /* a sanity check */
688                 return;
689
690         /* Clear page flags */
691         for_each_zone (zone) {
692                 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
693                         if (pfn_valid(zone_pfn + zone->zone_start_pfn))
694                                 ClearPageNosaveFree(pfn_to_page(zone_pfn +
695                                         zone->zone_start_pfn));
696         }
697
698         /* Mark orig addresses */
699         for_each_pbe (p, pblist)
700                 SetPageNosaveFree(virt_to_page(p->orig_address));
701
702 }
703
704 static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
705 {
706         /* We assume both lists contain the same number of elements */
707         while (src) {
708                 dst->orig_address = src->orig_address;
709                 dst = dst->next;
710                 src = src->next;
711         }
712 }
713
714 /*
715  *      Using bio to read from swap.
716  *      This code requires a bit more work than just using buffer heads
717  *      but, it is the recommended way for 2.5/2.6.
718  *      The following are to signal the beginning and end of I/O. Bios
719  *      finish asynchronously, while we want them to happen synchronously.
720  *      A simple atomic_t, and a wait loop take care of this problem.
721  */
722
723 static atomic_t io_done = ATOMIC_INIT(0);
724
725 static int end_io(struct bio *bio, unsigned int num, int err)
726 {
727         if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
728                 panic("I/O error reading memory image");
729         atomic_set(&io_done, 0);
730         return 0;
731 }
732
733 static struct block_device *resume_bdev;
734
735 /**
736  *      submit - submit BIO request.
737  *      @rw:    READ or WRITE.
738  *      @off    physical offset of page.
739  *      @page:  page we're reading or writing.
740  *
741  *      Straight from the textbook - allocate and initialize the bio.
742  *      If we're writing, make sure the page is marked as dirty.
743  *      Then submit it and wait.
744  */
745
746 static int submit(int rw, pgoff_t page_off, void *page)
747 {
748         int error = 0;
749         struct bio *bio;
750
751         bio = bio_alloc(GFP_ATOMIC, 1);
752         if (!bio)
753                 return -ENOMEM;
754         bio->bi_sector = page_off * (PAGE_SIZE >> 9);
755         bio_get(bio);
756         bio->bi_bdev = resume_bdev;
757         bio->bi_end_io = end_io;
758
759         if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
760                 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
761                 error = -EFAULT;
762                 goto Done;
763         }
764
765         if (rw == WRITE)
766                 bio_set_pages_dirty(bio);
767
768         atomic_set(&io_done, 1);
769         submit_bio(rw | (1 << BIO_RW_SYNC), bio);
770         while (atomic_read(&io_done))
771                 yield();
772
773  Done:
774         bio_put(bio);
775         return error;
776 }
777
778 static int bio_read_page(pgoff_t page_off, void *page)
779 {
780         return submit(READ, page_off, page);
781 }
782
783 static int bio_write_page(pgoff_t page_off, void *page)
784 {
785         return submit(WRITE, page_off, page);
786 }
787
788 /**
789  *      The following functions allow us to read data using a swap map
790  *      in a file-alike way
791  */
792
793 static inline void release_swap_map_reader(struct swap_map_handle *handle)
794 {
795         if (handle->cur)
796                 free_page((unsigned long)handle->cur);
797         handle->cur = NULL;
798 }
799
800 static inline int get_swap_map_reader(struct swap_map_handle *handle,
801                                       swp_entry_t start)
802 {
803         int error;
804
805         if (!swp_offset(start))
806                 return -EINVAL;
807         handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
808         if (!handle->cur)
809                 return -ENOMEM;
810         error = bio_read_page(swp_offset(start), handle->cur);
811         if (error) {
812                 release_swap_map_reader(handle);
813                 return error;
814         }
815         handle->k = 0;
816         return 0;
817 }
818
819 static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
820 {
821         unsigned long offset;
822         int error;
823
824         if (!handle->cur)
825                 return -EINVAL;
826         offset = swp_offset(handle->cur->entries[handle->k]);
827         if (!offset)
828                 return -EINVAL;
829         error = bio_read_page(offset, buf);
830         if (error)
831                 return error;
832         if (++handle->k >= MAP_PAGE_SIZE) {
833                 handle->k = 0;
834                 offset = swp_offset(handle->cur->next_swap);
835                 if (!offset)
836                         release_swap_map_reader(handle);
837                 else
838                         error = bio_read_page(offset, handle->cur);
839         }
840         return error;
841 }
842
843 /*
844  * Sanity check if this image makes sense with this kernel/swap context
845  * I really don't think that it's foolproof but more than nothing..
846  */
847
848 static const char *sanity_check(void)
849 {
850         dump_info();
851         if (swsusp_info.version_code != LINUX_VERSION_CODE)
852                 return "kernel version";
853         if (swsusp_info.num_physpages != num_physpages)
854                 return "memory size";
855         if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
856                 return "system type";
857         if (strcmp(swsusp_info.uts.release,system_utsname.release))
858                 return "kernel release";
859         if (strcmp(swsusp_info.uts.version,system_utsname.version))
860                 return "version";
861         if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
862                 return "machine";
863 #if 0
864         /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
865         if (swsusp_info.cpus != num_possible_cpus())
866                 return "number of cpus";
867 #endif
868         return NULL;
869 }
870
871 static int check_header(void)
872 {
873         const char *reason = NULL;
874         int error;
875
876         if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
877                 return error;
878
879         /* Is this same machine? */
880         if ((reason = sanity_check())) {
881                 printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
882                 return -EPERM;
883         }
884         return error;
885 }
886
887 static int check_sig(void)
888 {
889         int error;
890
891         memset(&swsusp_header, 0, sizeof(swsusp_header));
892         if ((error = bio_read_page(0, &swsusp_header)))
893                 return error;
894         if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
895                 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
896
897                 /*
898                  * Reset swap signature now.
899                  */
900                 error = bio_write_page(0, &swsusp_header);
901         } else {
902                 return -EINVAL;
903         }
904         if (!error)
905                 pr_debug("swsusp: Signature found, resuming\n");
906         return error;
907 }
908
909 /**
910  *      load_image_data - load the image data using the swap map handle
911  *      @handle and store them using the page backup list @pblist
912  *      (assume there are @nr_pages pages to load)
913  */
914
915 static int load_image_data(struct pbe *pblist,
916                            struct swap_map_handle *handle,
917                            unsigned int nr_pages)
918 {
919         int error;
920         unsigned int m;
921         struct pbe *p;
922
923         if (!pblist)
924                 return -EINVAL;
925         printk("Loading image data pages (%u pages) ...     ", nr_pages);
926         m = nr_pages / 100;
927         if (!m)
928                 m = 1;
929         nr_pages = 0;
930         p = pblist;
931         while (p) {
932                 error = swap_map_read_page(handle, (void *)p->address);
933                 if (error)
934                         break;
935                 p = p->next;
936                 if (!(nr_pages % m))
937                         printk("\b\b\b\b%3d%%", nr_pages / m);
938                 nr_pages++;
939         }
940         if (!error)
941                 printk("\b\b\b\bdone\n");
942         return error;
943 }
944
945 /**
946  *      unpack_orig_addresses - copy the elements of @buf[] (1 page) to
947  *      the PBEs in the list starting at @pbe
948  */
949
950 static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
951                                                 struct pbe *pbe)
952 {
953         int j;
954
955         for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
956                 pbe->orig_address = buf[j];
957                 pbe = pbe->next;
958         }
959         return pbe;
960 }
961
962 /**
963  *      load_image_metadata - load the image metadata using the swap map
964  *      handle @handle and put them into the PBEs in the list @pblist
965  */
966
967 static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
968 {
969         struct pbe *p;
970         unsigned long *buf;
971         unsigned int n = 0;
972         int error = 0;
973
974         printk("Loading image metadata ... ");
975         buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
976         if (!buf)
977                 return -ENOMEM;
978         p = pblist;
979         while (p) {
980                 error = swap_map_read_page(handle, buf);
981                 if (error)
982                         break;
983                 p = unpack_orig_addresses(buf, p);
984                 n++;
985         }
986         free_page((unsigned long)buf);
987         if (!error)
988                 printk("done (%u pages loaded)\n", n);
989         return error;
990 }
991
/*
 * Validate the suspend image: first the signature (check_sig()), then
 * the image header (check_header()).  The header is only examined if
 * the signature check succeeded.
 *
 * Returns 0 if both checks pass, otherwise the first error encountered.
 */
static int check_suspend_image(void)
{
        int ret = check_sig();

        if (!ret)
                ret = check_header();

        return ret;
}
1004
1005 static int read_suspend_image(struct pbe **pblist_ptr)
1006 {
1007         int error = 0;
1008         struct pbe *p, *pblist;
1009         struct swap_map_handle handle;
1010         unsigned int nr_pages = swsusp_info.image_pages;
1011
1012         p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
1013         if (!p)
1014                 return -ENOMEM;
1015         error = get_swap_map_reader(&handle, swsusp_info.start);
1016         if (error)
1017                 /* The PBE list at p will be released by swsusp_free() */
1018                 return error;
1019         error = load_image_metadata(p, &handle);
1020         if (!error) {
1021                 mark_unsafe_pages(p);
1022                 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
1023                 if (pblist)
1024                         copy_page_backup_list(pblist, p);
1025                 free_pagedir(p);
1026                 if (!pblist)
1027                         error = -ENOMEM;
1028
1029                 /* Allocate memory for the image and read the data from swap */
1030                 if (!error)
1031                         error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
1032                 if (!error) {
1033                         release_eaten_pages();
1034                         error = load_image_data(pblist, &handle, nr_pages);
1035                 }
1036                 if (!error)
1037                         *pblist_ptr = pblist;
1038         }
1039         release_swap_map_reader(&handle);
1040         return error;
1041 }
1042
1043 /**
1044  *      swsusp_check - Check for saved image in swap
1045  */
1046
1047 int swsusp_check(void)
1048 {
1049         int error;
1050
1051         resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1052         if (!IS_ERR(resume_bdev)) {
1053                 set_blocksize(resume_bdev, PAGE_SIZE);
1054                 error = check_suspend_image();
1055                 if (error)
1056                     blkdev_put(resume_bdev);
1057         } else
1058                 error = PTR_ERR(resume_bdev);
1059
1060         if (!error)
1061                 pr_debug("swsusp: resume file found\n");
1062         else
1063                 pr_debug("swsusp: Error %d check for resume file\n", error);
1064         return error;
1065 }
1066
1067 /**
1068  *      swsusp_read - Read saved image from swap.
1069  */
1070
1071 int swsusp_read(struct pbe **pblist_ptr)
1072 {
1073         int error;
1074
1075         if (IS_ERR(resume_bdev)) {
1076                 pr_debug("swsusp: block device not initialised\n");
1077                 return PTR_ERR(resume_bdev);
1078         }
1079
1080         error = read_suspend_image(pblist_ptr);
1081         blkdev_put(resume_bdev);
1082
1083         if (!error)
1084                 pr_debug("swsusp: Reading resume file was successful\n");
1085         else
1086                 pr_debug("swsusp: Error %d resuming\n", error);
1087         return error;
1088 }
1089
1090 /**
1091  *      swsusp_close - close swap device.
1092  */
1093
1094 void swsusp_close(void)
1095 {
1096         if (IS_ERR(resume_bdev)) {
1097                 pr_debug("swsusp: block device not initialised\n");
1098                 return;
1099         }
1100
1101         blkdev_put(resume_bdev);
1102 }