Routino SVN Repository Browser

Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino

ViewVC logotype

Contents of /trunk/src/sorting.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 680 - (show annotations) (download) (as text)
Sun Apr 24 15:14:53 2011 UTC (13 years, 11 months ago) by amb
File MIME type: text/x-csrc
File size: 16013 byte(s)
Update comments throughout the source code.

1 /***************************************
2 Merge sort functions.
3
4 Part of the Routino routing software.
5 ******************/ /******************
6 This file Copyright 2009-2011 Andrew M. Bishop
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU Affero General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ***************************************/
21
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <assert.h>
27
28 #include "types.h"
29
30 #include "files.h"
31 #include "sorting.h"
32
33
34 /* Global variables */
35
36 /*+ The command line '--tmpdir' option or its default value. +*/
37 extern char *option_tmpdirname;
38
39 /*+ The amount of RAM to use for filesorting. +*/
40 extern size_t option_filesort_ramsize;
41
42
43 /*++++++++++++++++++++++++++++++++++++++
44 A function to sort the contents of a file of fixed length objects using a
45 limited amount of RAM.
46
47 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
48 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
49 The individual sort steps and the merge step both use a "Heap sort"
50 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
51 if the data is already partially sorted.
52
53 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
54
55 int fd_out The file descriptor of the output file (opened for writing and empty).
56
57 size_t itemsize The size of each item in the file that needs sorting.
58
59 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
60 data to be sorted is an array of things not pointers).
61
62 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
63 returns 1 then it is written to the output file.
64 ++++++++++++++++++++++++++++++++++++++*/
65
66 void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
67 {
68 int *fds=NULL,*heap=NULL;
69 int nfiles=0,ndata=0;
70 index_t count=0,total=0;
71 size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*));
72 void *data=NULL,**datap=NULL;
73 char *filename;
74 int i,more=1;
75
76 /* Allocate the RAM buffer and other bits */
77
78 data=malloc(nitems*itemsize);
79 datap=malloc(nitems*sizeof(void*));
80
81 filename=(char*)malloc(strlen(option_tmpdirname)+24);
82
83 /* Loop around, fill the buffer, sort the data and write a temporary file */
84
85 do
86 {
87 int fd,n=0;
88
89 /* Read in the data and create pointers */
90
91 for(i=0;i<nitems;i++)
92 {
93 datap[i]=data+i*itemsize;
94
95 if(ReadFile(fd_in,datap[i],itemsize))
96 {
97 more=0;
98 break;
99 }
100
101 total++;
102 }
103
104 n=i;
105
106 /* Shortcut if there is no data and no previous files (i.e. no data at all) */
107
108 if(nfiles==0 && n==0)
109 goto tidy_and_exit;
110
111 /* No new data read in this time round */
112
113 if(n==0)
114 break;
115
116 /* Sort the data pointers using a heap sort */
117
118 filesort_heapsort(datap,n,compare);
119
120 /* Shortcut if all read in and sorted at once */
121
122 if(nfiles==0 && !more)
123 {
124 for(i=0;i<n;i++)
125 {
126 if(!buildindex || buildindex(datap[i],count))
127 {
128 WriteFile(fd_out,datap[i],itemsize);
129 count++;
130 }
131 }
132
133 goto tidy_and_exit;
134 }
135
136 /* Create a temporary file and write the result */
137
138 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
139
140 fd=OpenFileNew(filename);
141
142 for(i=0;i<n;i++)
143 WriteFile(fd,datap[i],itemsize);
144
145 CloseFile(fd);
146
147 nfiles++;
148 }
149 while(more);
150
151 /* Shortcut if only one file (unlucky for us there must have been exactly
152 nitems, lucky for us we still have the data in RAM) */
153
154 if(nfiles==1)
155 {
156 for(i=0;i<nitems;i++)
157 {
158 if(!buildindex || buildindex(datap[i],count))
159 {
160 WriteFile(fd_out,datap[i],itemsize);
161 count++;
162 }
163 }
164
165 DeleteFile(filename);
166
167 goto tidy_and_exit;
168 }
169
170 /* Check that number of files is less than file size */
171
172 assert(nfiles<nitems);
173
174 /* Open all of the temporary files */
175
176 fds=(int*)malloc(nfiles*sizeof(int));
177
178 for(i=0;i<nfiles;i++)
179 {
180 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
181
182 fds[i]=ReOpenFile(filename);
183
184 DeleteFile(filename);
185 }
186
187 /* Perform an n-way merge using a binary heap */
188
189 heap=(int*)malloc(nfiles*sizeof(int));
190
191 /* Fill the heap to start with */
192
193 for(i=0;i<nfiles;i++)
194 {
195 int index;
196
197 datap[i]=data+i*itemsize;
198
199 ReadFile(fds[i],datap[i],itemsize);
200
201 heap[i]=i;
202
203 index=i;
204
205 /* Bubble up the new value */
206
207 while(index>0 &&
208 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
209 {
210 int newindex;
211 int temp;
212
213 newindex=(index-1)/2;
214
215 temp=heap[index];
216 heap[index]=heap[newindex];
217 heap[newindex]=temp;
218
219 index=newindex;
220 }
221 }
222
223 /* Repeatedly pull out the root of the heap and refill from the same file */
224
225 ndata=nfiles;
226
227 do
228 {
229 int index=0;
230
231 if(!buildindex || buildindex(datap[heap[0]],count))
232 {
233 WriteFile(fd_out,datap[heap[0]],itemsize);
234 count++;
235 }
236
237 if(ReadFile(fds[heap[0]],datap[heap[0]],itemsize))
238 {
239 ndata--;
240 heap[0]=heap[ndata];
241 }
242
243 /* Bubble down the new value */
244
245 while((2*index+2)<ndata &&
246 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
247 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
248 {
249 int newindex;
250 int temp;
251
252 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
253 newindex=2*index+1;
254 else
255 newindex=2*index+2;
256
257 temp=heap[newindex];
258 heap[newindex]=heap[index];
259 heap[index]=temp;
260
261 index=newindex;
262 }
263
264 if((2*index+2)==ndata &&
265 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
266 {
267 int newindex;
268 int temp;
269
270 newindex=2*index+1;
271
272 temp=heap[newindex];
273 heap[newindex]=heap[index];
274 heap[index]=temp;
275 }
276 }
277 while(ndata>0);
278
279 /* Tidy up */
280
281 tidy_and_exit:
282
283 if(fds)
284 {
285 for(i=0;i<nfiles;i++)
286 CloseFile(fds[i]);
287 free(fds);
288 }
289
290 if(heap)
291 free(heap);
292
293 free(data);
294 free(datap);
295 free(filename);
296 }
297
298
299 /*++++++++++++++++++++++++++++++++++++++
300 A function to sort the contents of a file of variable length objects (each
301 preceded by its length in 2 bytes) using a limited amount of RAM.
302
303 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
304 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
305 The individual sort steps and the merge step both use a "Heap sort"
306 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
307 if the data is already partially sorted.
308
309 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
310
311 int fd_out The file descriptor of the output file (opened for writing and empty).
312
313 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
314 data to be sorted is an array of things not pointers).
315
316 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
317 returns 1 then it is written to the output file.
318 ++++++++++++++++++++++++++++++++++++++*/
319
320 void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
321 {
322 int *fds=NULL,*heap=NULL;
323 int nfiles=0,ndata=0;
324 index_t count=0,total=0;
325 FILESORT_VARINT nextitemsize,largestitemsize=0;
326 void *data=NULL,**datap=NULL;
327 char *filename;
328 int i,more=1;
329
330 /* Allocate the RAM buffer and other bits */
331
332 data=malloc(option_filesort_ramsize);
333
334 filename=(char*)malloc(strlen(option_tmpdirname)+24);
335
336 /* Loop around, fill the buffer, sort the data and write a temporary file */
337
338 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */
339 goto tidy_and_exit;
340
341 do
342 {
343 int fd,n=0;
344 size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE;
345
346 datap=data+option_filesort_ramsize;
347
348 /* Read in the data and create pointers */
349
350 while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data))
351 {
352 FILESORT_VARINT itemsize=nextitemsize;
353
354 if(itemsize>largestitemsize)
355 largestitemsize=itemsize;
356
357 *(FILESORT_VARINT*)(data+ramused)=itemsize;
358
359 ramused+=FILESORT_VARSIZE;
360
361 ReadFile(fd_in,data+ramused,itemsize);
362
363 *--datap=data+ramused; /* points to real data */
364
365 ramused+=itemsize;
366
367 ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN);
368 ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE;
369
370 total++;
371 n++;
372
373 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE))
374 {
375 more=0;
376 break;
377 }
378 }
379
380 /* No new data read in this time round */
381
382 if(n==0)
383 break;
384
385 /* Sort the data pointers using a heap sort */
386
387 filesort_heapsort(datap,n,compare);
388
389 /* Shortcut if all read in and sorted at once */
390
391 if(nfiles==0 && !more)
392 {
393 for(i=0;i<n;i++)
394 {
395 if(!buildindex || buildindex(datap[i],count))
396 {
397 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
398
399 WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
400 count++;
401 }
402 }
403
404 goto tidy_and_exit;
405 }
406
407 /* Create a temporary file and write the result */
408
409 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
410
411 fd=OpenFileNew(filename);
412
413 for(i=0;i<n;i++)
414 {
415 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
416
417 WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
418 }
419
420 CloseFile(fd);
421
422 nfiles++;
423 }
424 while(more);
425
426 /* Check that number of files is less than file size */
427
428 largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN);
429
430 assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize));
431
432 /* Open all of the temporary files */
433
434 fds=(int*)malloc(nfiles*sizeof(int));
435
436 for(i=0;i<nfiles;i++)
437 {
438 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
439
440 fds[i]=ReOpenFile(filename);
441
442 DeleteFile(filename);
443 }
444
445 /* Perform an n-way merge using a binary heap */
446
447 heap=(int*)malloc(nfiles*sizeof(int));
448
449 datap=data+option_filesort_ramsize-nfiles*sizeof(void*);
450
451 /* Fill the heap to start with */
452
453 for(i=0;i<nfiles;i++)
454 {
455 int index;
456 FILESORT_VARINT itemsize;
457
458 datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize;
459
460 ReadFile(fds[i],&itemsize,FILESORT_VARSIZE);
461
462 *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize;
463
464 ReadFile(fds[i],datap[i],itemsize);
465
466 heap[i]=i;
467
468 index=i;
469
470 /* Bubble up the new value */
471
472 while(index>0 &&
473 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
474 {
475 int newindex;
476 int temp;
477
478 newindex=(index-1)/2;
479
480 temp=heap[index];
481 heap[index]=heap[newindex];
482 heap[newindex]=temp;
483
484 index=newindex;
485 }
486 }
487
488 /* Repeatedly pull out the root of the heap and refill from the same file */
489
490 ndata=nfiles;
491
492 do
493 {
494 int index=0;
495 FILESORT_VARINT itemsize;
496
497 if(!buildindex || buildindex(datap[heap[0]],count))
498 {
499 itemsize=*(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE);
500
501 WriteFile(fd_out,datap[heap[0]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
502 count++;
503 }
504
505 if(ReadFile(fds[heap[0]],&itemsize,FILESORT_VARSIZE))
506 {
507 ndata--;
508 heap[0]=heap[ndata];
509 }
510 else
511 {
512 *(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE)=itemsize;
513
514 ReadFile(fds[heap[0]],datap[heap[0]],itemsize);
515 }
516
517 /* Bubble down the new value */
518
519 while((2*index+2)<ndata &&
520 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
521 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
522 {
523 int newindex;
524 int temp;
525
526 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
527 newindex=2*index+1;
528 else
529 newindex=2*index+2;
530
531 temp=heap[newindex];
532 heap[newindex]=heap[index];
533 heap[index]=temp;
534
535 index=newindex;
536 }
537
538 if((2*index+2)==ndata &&
539 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
540 {
541 int newindex;
542 int temp;
543
544 newindex=2*index+1;
545
546 temp=heap[newindex];
547 heap[newindex]=heap[index];
548 heap[index]=temp;
549 }
550 }
551 while(ndata>0);
552
553 /* Tidy up */
554
555 tidy_and_exit:
556
557 if(fds)
558 {
559 for(i=0;i<nfiles;i++)
560 CloseFile(fds[i]);
561 free(fds);
562 }
563
564 if(heap)
565 free(heap);
566
567 free(data);
568 free(filename);
569 }
570
571
572 /*++++++++++++++++++++++++++++++++++++++
573 A function to sort an array of pointers efficiently.
574
575 The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort,
576 in particular an this good because it can operate in-place and doesn't
577 allocate more memory like using qsort() does.
578
579 void **datap A pointer to the array of pointers to sort.
580
581 size_t nitems The number of items of data to sort.
582
583 int(*compare)(const void *, const void *) The comparison function (identical to qsort if the
584 data to be sorted was an array of things not pointers).
585 ++++++++++++++++++++++++++++++++++++++*/
586
587 void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*))
588 {
589 int i;
590
591 /* Fill the heap by pretending to insert the data that is already there */
592
593 for(i=1;i<nitems;i++)
594 {
595 int index=i;
596
597 /* Bubble up the new value (upside-down, put largest at top) */
598
599 while(index>0 &&
600 compare(datap[index],datap[(index-1)/2])>0) /* reversed compared to filesort() above */
601 {
602 int newindex;
603 void *temp;
604
605 newindex=(index-1)/2;
606
607 temp=datap[index];
608 datap[index]=datap[newindex];
609 datap[newindex]=temp;
610
611 index=newindex;
612 }
613 }
614
615 /* Repeatedly pull out the root of the heap and swap with the bottom item */
616
617 for(i=nitems-1;i>0;i--)
618 {
619 int index=0;
620 void *temp;
621
622 temp=datap[index];
623 datap[index]=datap[i];
624 datap[i]=temp;
625
626 /* Bubble down the new value (upside-down, put largest at top) */
627
628 while((2*index+2)<i &&
629 (compare(datap[index],datap[2*index+1])<0 || /* reversed compared to filesort() above */
630 compare(datap[index],datap[2*index+2])<0)) /* reversed compared to filesort() above */
631 {
632 int newindex;
633 void *temp;
634
635 if(compare(datap[2*index+1],datap[2*index+2])>0) /* reversed compared to filesort() above */
636 newindex=2*index+1;
637 else
638 newindex=2*index+2;
639
640 temp=datap[newindex];
641 datap[newindex]=datap[index];
642 datap[index]=temp;
643
644 index=newindex;
645 }
646
647 if((2*index+2)==i &&
648 compare(datap[index],datap[2*index+1])<0) /* reversed compared to filesort() above */
649 {
650 int newindex;
651 void *temp;
652
653 newindex=2*index+1;
654
655 temp=datap[newindex];
656 datap[newindex]=datap[index];
657 datap[index]=temp;
658 }
659 }
660 }

Properties

Name Value
cvs:description Functions to perform sorting.