Routino SVN Repository Browser

Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino

ViewVC logotype

Contents of /trunk/src/sorting.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 871 - (show annotations) (download) (as text)
Sat Oct 15 13:45:34 2011 UTC (13 years, 5 months ago) by amb
File MIME type: text/x-csrc
File size: 20132 byte(s)
Bug fixes for the previous change.

1 /***************************************
2 Merge sort functions.
3
4 Part of the Routino routing software.
5 ******************/ /******************
6 This file Copyright 2009-2011 Andrew M. Bishop
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU Affero General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ***************************************/
21
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <assert.h>
27
28 #include "types.h"
29
30 #include "files.h"
31 #include "sorting.h"
32
33
34 /* Global variables */
35
36 /*+ The command line '--tmpdir' option or its default value. +*/
37 extern char *option_tmpdirname;
38
39 /*+ The amount of RAM to use for filesorting. +*/
40 extern size_t option_filesort_ramsize;
41
42
43 /*++++++++++++++++++++++++++++++++++++++
44 A function to sort the contents of a file of fixed length objects using a
45 limited amount of RAM.
46
47 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
48 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
49 The individual sort steps and the merge step both use a "Heap sort"
50 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
51 if the data is already partially sorted.
52
53 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
54
55 int fd_out The file descriptor of the output file (opened for writing and empty).
56
57 size_t itemsize The size of each item in the file that needs sorting.
58
59 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
60 data to be sorted is an array of things not pointers).
61
62 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
63 returns 1 then it is written to the output file.
64 ++++++++++++++++++++++++++++++++++++++*/
65
66 void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
67 {
68 int *fds=NULL,*heap=NULL;
69 int nfiles=0,ndata=0;
70 index_t count=0,total=0;
71 size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*));
72 void *data=NULL,**datap=NULL;
73 char *filename;
74 int i,more=1;
75
76 /* Allocate the RAM buffer and other bits */
77
78 data=malloc(nitems*itemsize);
79 datap=malloc(nitems*sizeof(void*));
80
81 filename=(char*)malloc(strlen(option_tmpdirname)+24);
82
83 /* Loop around, fill the buffer, sort the data and write a temporary file */
84
85 do
86 {
87 int fd,n=0;
88
89 /* Read in the data and create pointers */
90
91 for(i=0;i<nitems;i++)
92 {
93 datap[i]=data+i*itemsize;
94
95 if(ReadFile(fd_in,datap[i],itemsize))
96 {
97 more=0;
98 break;
99 }
100
101 total++;
102 }
103
104 n=i;
105
106 /* Shortcut if there is no data and no previous files (i.e. no data at all) */
107
108 if(nfiles==0 && n==0)
109 goto tidy_and_exit;
110
111 /* No new data read in this time round */
112
113 if(n==0)
114 break;
115
116 /* Sort the data pointers using a heap sort */
117
118 filesort_heapsort(datap,n,compare);
119
120 /* Shortcut if all read in and sorted at once */
121
122 if(nfiles==0 && !more)
123 {
124 for(i=0;i<n;i++)
125 {
126 if(!buildindex || buildindex(datap[i],count))
127 {
128 WriteFile(fd_out,datap[i],itemsize);
129 count++;
130 }
131 }
132
133 goto tidy_and_exit;
134 }
135
136 /* Create a temporary file and write the result */
137
138 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
139
140 fd=OpenFileNew(filename);
141
142 for(i=0;i<n;i++)
143 WriteFile(fd,datap[i],itemsize);
144
145 CloseFile(fd);
146
147 nfiles++;
148 }
149 while(more);
150
151 /* Shortcut if only one file (unlucky for us there must have been exactly
152 nitems, lucky for us we still have the data in RAM) */
153
154 if(nfiles==1)
155 {
156 for(i=0;i<nitems;i++)
157 {
158 if(!buildindex || buildindex(datap[i],count))
159 {
160 WriteFile(fd_out,datap[i],itemsize);
161 count++;
162 }
163 }
164
165 DeleteFile(filename);
166
167 goto tidy_and_exit;
168 }
169
170 /* Check that number of files is less than file size */
171
172 assert(nfiles<nitems);
173
174 /* Open all of the temporary files */
175
176 fds=(int*)malloc(nfiles*sizeof(int));
177
178 for(i=0;i<nfiles;i++)
179 {
180 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
181
182 fds[i]=ReOpenFile(filename);
183
184 DeleteFile(filename);
185 }
186
187 /* Perform an n-way merge using a binary heap */
188
189 heap=(int*)malloc(nfiles*sizeof(int));
190
191 /* Fill the heap to start with */
192
193 for(i=0;i<nfiles;i++)
194 {
195 int index;
196
197 datap[i]=data+i*itemsize;
198
199 ReadFile(fds[i],datap[i],itemsize);
200
201 index=i;
202
203 heap[index]=i;
204
205 /* Bubble up the new value */
206
207 while(index>0)
208 {
209 int newindex;
210 int temp;
211
212 newindex=(index-1)/4;
213
214 if(compare(datap[heap[index]],datap[heap[newindex]])>=0)
215 break;
216
217 temp=heap[index];
218 heap[index]=heap[newindex];
219 heap[newindex]=temp;
220
221 index=newindex;
222 }
223 }
224
225 /* Repeatedly pull out the root of the heap and refill from the same file */
226
227 ndata=nfiles;
228
229 do
230 {
231 int index=0;
232
233 if(!buildindex || buildindex(datap[heap[index]],count))
234 {
235 WriteFile(fd_out,datap[heap[index]],itemsize);
236 count++;
237 }
238
239 if(ReadFile(fds[heap[index]],datap[heap[index]],itemsize))
240 {
241 ndata--;
242 heap[index]=heap[ndata];
243 }
244
245 /* Bubble down the new value */
246
247 while((4*index+4)<ndata)
248 {
249 int childindex,newindex;
250 int temp;
251
252 childindex=newindex=4*index+1;
253
254 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
255 newindex=childindex;
256 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
257 newindex=childindex;
258 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
259 newindex=childindex;
260
261 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
262 break;
263
264 temp=heap[newindex];
265 heap[newindex]=heap[index];
266 heap[index]=temp;
267
268 index=newindex;
269 }
270
271 if((4*index+4)==ndata)
272 {
273 int childindex,newindex;
274 int temp;
275
276 childindex=newindex=4*index+1;
277
278 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
279 newindex=childindex;
280 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
281 newindex=childindex;
282
283 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
284 ; /* break */
285 else
286 {
287 temp=heap[newindex];
288 heap[newindex]=heap[index];
289 heap[index]=temp;
290 }
291 }
292
293 else if((4*index+3)==ndata)
294 {
295 int childindex,newindex;
296 int temp;
297
298 childindex=newindex=4*index+1;
299
300 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
301 newindex=childindex;
302
303 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
304 ; /* break */
305 else
306 {
307 temp=heap[newindex];
308 heap[newindex]=heap[index];
309 heap[index]=temp;
310 }
311 }
312
313 else if((4*index+2)==ndata)
314 {
315 int newindex;
316 int temp;
317
318 newindex=4*index+1;
319
320 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
321 ; /* break */
322 else
323 {
324 temp=heap[newindex];
325 heap[newindex]=heap[index];
326 heap[index]=temp;
327 }
328 }
329 }
330 while(ndata>0);
331
332 /* Tidy up */
333
334 tidy_and_exit:
335
336 if(fds)
337 {
338 for(i=0;i<nfiles;i++)
339 CloseFile(fds[i]);
340 free(fds);
341 }
342
343 if(heap)
344 free(heap);
345
346 free(data);
347 free(datap);
348 free(filename);
349 }
350
351
352 /*++++++++++++++++++++++++++++++++++++++
353 A function to sort the contents of a file of variable length objects (each
354 preceded by its length in FILESORT_VARSIZE bytes) using a limited amount of RAM.
355
356 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
357 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
358 The individual sort steps and the merge step both use a "Heap sort"
359 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
360 if the data is already partially sorted.
361
362 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
363
364 int fd_out The file descriptor of the output file (opened for writing and empty).
365
366 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
367 data to be sorted is an array of things not pointers).
368
369 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
370 returns 1 then it is written to the output file.
371 ++++++++++++++++++++++++++++++++++++++*/
372
373 void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
374 {
375 int *fds=NULL,*heap=NULL;
376 int nfiles=0,ndata=0;
377 index_t count=0,total=0;
378 FILESORT_VARINT nextitemsize,largestitemsize=0;
379 void *data=NULL,**datap=NULL;
380 char *filename;
381 int i,more=1;
382
383 /* Allocate the RAM buffer and other bits */
384
385 data=malloc(option_filesort_ramsize);
386
387 filename=(char*)malloc(strlen(option_tmpdirname)+24);
388
389 /* Loop around, fill the buffer, sort the data and write a temporary file */
390
391 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */
392 goto tidy_and_exit;
393
394 do
395 {
396 int fd,n=0;
397 size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE;
398
399 datap=data+option_filesort_ramsize;
400
401 /* Read in the data and create pointers */
402
403 while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data))
404 {
405 FILESORT_VARINT itemsize=nextitemsize;
406
407 if(itemsize>largestitemsize)
408 largestitemsize=itemsize;
409
410 *(FILESORT_VARINT*)(data+ramused)=itemsize;
411
412 ramused+=FILESORT_VARSIZE;
413
414 ReadFile(fd_in,data+ramused,itemsize);
415
416 *--datap=data+ramused; /* points to real data */
417
418 ramused+=itemsize;
419
420 ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN);
421 ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE;
422
423 total++;
424 n++;
425
426 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE))
427 {
428 more=0;
429 break;
430 }
431 }
432
433 /* No new data read in this time round */
434
435 if(n==0)
436 break;
437
438 /* Sort the data pointers using a heap sort */
439
440 filesort_heapsort(datap,n,compare);
441
442 /* Shortcut if all read in and sorted at once */
443
444 if(nfiles==0 && !more)
445 {
446 for(i=0;i<n;i++)
447 {
448 if(!buildindex || buildindex(datap[i],count))
449 {
450 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
451
452 WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
453 count++;
454 }
455 }
456
457 goto tidy_and_exit;
458 }
459
460 /* Create a temporary file and write the result */
461
462 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
463
464 fd=OpenFileNew(filename);
465
466 for(i=0;i<n;i++)
467 {
468 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
469
470 WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
471 }
472
473 CloseFile(fd);
474
475 nfiles++;
476 }
477 while(more);
478
479 /* Check that number of files is less than file size */
480
481 largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN);
482
483 assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize));
484
485 /* Open all of the temporary files */
486
487 fds=(int*)malloc(nfiles*sizeof(int));
488
489 for(i=0;i<nfiles;i++)
490 {
491 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
492
493 fds[i]=ReOpenFile(filename);
494
495 DeleteFile(filename);
496 }
497
498 /* Perform an n-way merge using a binary heap */
499
500 heap=(int*)malloc(nfiles*sizeof(int));
501
502 datap=data+option_filesort_ramsize-nfiles*sizeof(void*);
503
504 /* Fill the heap to start with */
505
506 for(i=0;i<nfiles;i++)
507 {
508 int index;
509 FILESORT_VARINT itemsize;
510
511 datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize;
512
513 ReadFile(fds[i],&itemsize,FILESORT_VARSIZE);
514
515 *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize;
516
517 ReadFile(fds[i],datap[i],itemsize);
518
519 index=i;
520
521 heap[index]=i;
522
523 /* Bubble up the new value */
524
525 while(index>0)
526 {
527 int newindex;
528 int temp;
529
530 newindex=(index-1)/4;
531
532 if(compare(datap[heap[index]],datap[heap[newindex]])>=0)
533 break;
534
535 temp=heap[index];
536 heap[index]=heap[newindex];
537 heap[newindex]=temp;
538
539 index=newindex;
540 }
541 }
542
543 /* Repeatedly pull out the root of the heap and refill from the same file */
544
545 ndata=nfiles;
546
547 do
548 {
549 int index=0;
550 FILESORT_VARINT itemsize;
551
552 if(!buildindex || buildindex(datap[heap[index]],count))
553 {
554 itemsize=*(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE);
555
556 WriteFile(fd_out,datap[heap[index]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
557 count++;
558 }
559
560 if(ReadFile(fds[heap[index]],&itemsize,FILESORT_VARSIZE))
561 {
562 ndata--;
563 heap[index]=heap[ndata];
564 }
565 else
566 {
567 *(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE)=itemsize;
568
569 ReadFile(fds[heap[index]],datap[heap[index]],itemsize);
570 }
571
572 /* Bubble down the new value */
573
574 while((4*index+4)<ndata)
575 {
576 int childindex,newindex;
577 int temp;
578
579 childindex=newindex=4*index+1;
580
581 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
582 newindex=childindex;
583 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
584 newindex=childindex;
585 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
586 newindex=childindex;
587
588 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
589 break;
590
591 temp=heap[newindex];
592 heap[newindex]=heap[index];
593 heap[index]=temp;
594
595 index=newindex;
596 }
597
598 if((4*index+4)==ndata)
599 {
600 int childindex,newindex;
601 int temp;
602
603 childindex=newindex=4*index+1;
604
605 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
606 newindex=childindex;
607 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
608 newindex=childindex;
609
610 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
611 ; /* break */
612 else
613 {
614 temp=heap[newindex];
615 heap[newindex]=heap[index];
616 heap[index]=temp;
617 }
618 }
619
620 else if((4*index+3)==ndata)
621 {
622 int childindex,newindex;
623 int temp;
624
625 childindex=newindex=4*index+1;
626
627 if(compare(datap[heap[newindex]],datap[heap[++childindex]])>0)
628 newindex=childindex;
629
630 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
631 ; /* break */
632 else
633 {
634 temp=heap[newindex];
635 heap[newindex]=heap[index];
636 heap[index]=temp;
637 }
638 }
639
640 else if((4*index+2)==ndata)
641 {
642 int newindex;
643 int temp;
644
645 newindex=4*index+1;
646
647 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
648 ; /* break */
649 else
650 {
651 temp=heap[newindex];
652 heap[newindex]=heap[index];
653 heap[index]=temp;
654 }
655 }
656 }
657 while(ndata>0);
658
659 /* Tidy up */
660
661 tidy_and_exit:
662
663 if(fds)
664 {
665 for(i=0;i<nfiles;i++)
666 CloseFile(fds[i]);
667 free(fds);
668 }
669
670 if(heap)
671 free(heap);
672
673 free(data);
674 free(filename);
675 }
676
677
678 /*++++++++++++++++++++++++++++++++++++++
679 A function to sort an array of pointers efficiently.
680
681 The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort,
682 in particular an this good because it can operate in-place and doesn't
683 allocate more memory like using qsort() does.
684
685 void **datap A pointer to the array of pointers to sort.
686
687 size_t nitems The number of items of data to sort.
688
689 int(*compare)(const void *, const void *) The comparison function (identical to qsort if the
690 data to be sorted was an array of things not pointers).
691 ++++++++++++++++++++++++++++++++++++++*/
692
693 void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*))
694 {
695 int i;
696
697 /* Fill the heap by pretending to insert the data that is already there */
698
699 for(i=1;i<nitems;i++)
700 {
701 int index=i;
702
703 /* Bubble up the new value (upside-down, put largest at top) */
704
705 while(index>0)
706 {
707 int newindex;
708 void *temp;
709
710 newindex=(index-1)/4;
711
712 if(compare(datap[index],datap[newindex])<=0) /* reversed compared to filesort_fixed() above */
713 break;
714
715 temp=datap[index];
716 datap[index]=datap[newindex];
717 datap[newindex]=temp;
718
719 index=newindex;
720 }
721 }
722
723 /* Repeatedly pull out the root of the heap and swap with the bottom item */
724
725 for(i=nitems-1;i>0;i--)
726 {
727 int index=0;
728 void *temp;
729
730 temp=datap[index];
731 datap[index]=datap[i];
732 datap[i]=temp;
733
734 /* Bubble down the new value (upside-down, put largest at top) */
735
736 while((4*index+4)<i)
737 {
738 int childindex,newindex;
739 void *temp;
740
741 childindex=newindex=4*index+1;
742
743 if(compare(datap[newindex],datap[++childindex])<0) /* reversed compared to filesort_fixed() above */
744 newindex=childindex;
745 if(compare(datap[newindex],datap[++childindex])<0) /* reversed compared to filesort_fixed() above */
746 newindex=childindex;
747 if(compare(datap[newindex],datap[++childindex])<0) /* reversed compared to filesort_fixed() above */
748 newindex=childindex;
749
750 if(compare(datap[index],datap[newindex])>=0) /* reversed compared to filesort_fixed() above */
751 break;
752
753 temp=datap[newindex];
754 datap[newindex]=datap[index];
755 datap[index]=temp;
756
757 index=newindex;
758 }
759
760 if((4*index+4)==i)
761 {
762 int childindex,newindex;
763 void *temp;
764
765 childindex=newindex=4*index+1;
766
767 if(compare(datap[newindex],datap[++childindex])<0) /* reversed compared to filesort_fixed() above */
768 newindex=childindex;
769 if(compare(datap[newindex],datap[++childindex])<0) /* reversed compared to filesort_fixed() above */
770 newindex=childindex;
771
772 if(compare(datap[index],datap[newindex])>=0) /* reversed compared to filesort_fixed() above */
773 ; /* break */
774 else
775 {
776 temp=datap[newindex];
777 datap[newindex]=datap[index];
778 datap[index]=temp;
779 }
780 }
781
782 else if((4*index+3)==i)
783 {
784 int childindex,newindex;
785 void *temp;
786
787 childindex=newindex=4*index+1;
788
789 if(compare(datap[newindex],datap[++childindex])<0) /* reversed compared to filesort_fixed() above */
790 newindex=childindex;
791
792 if(compare(datap[index],datap[newindex])>=0) /* reversed compared to filesort_fixed() above */
793 ; /* break */
794 else
795 {
796 temp=datap[newindex];
797 datap[newindex]=datap[index];
798 datap[index]=temp;
799 }
800 }
801
802 else if((4*index+2)==i)
803 {
804 int newindex;
805 void *temp;
806
807 newindex=4*index+1;
808
809 if(compare(datap[index],datap[newindex])>=0) /* reversed compared to filesort_fixed() above */
810 ; /* break */
811 else
812 {
813 temp=datap[newindex];
814 datap[newindex]=datap[index];
815 datap[index]=temp;
816 }
817 }
818 }
819 }

Properties

Name Value
cvs:description Functions to perform sorting.