Routino SVN Repository Browser

Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino

ViewVC logotype

Contents of /trunk/src/sorting.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 873 - (show annotations) (download) (as text)
Sat Oct 22 14:04:52 2011 UTC (13 years, 5 months ago) by amb
File MIME type: text/x-csrc
File size: 16126 byte(s)
Revert back to r864 zero-based binary heap but with r868/r869 refactored code.

1 /***************************************
2 Merge sort functions.
3
4 Part of the Routino routing software.
5 ******************/ /******************
6 This file Copyright 2009-2011 Andrew M. Bishop
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU Affero General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ***************************************/
21
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <assert.h>
27
28 #include "types.h"
29
30 #include "files.h"
31 #include "sorting.h"
32
33
34 /* Global variables */
35
36 /*+ The command line '--tmpdir' option or its default value. +*/
37 extern char *option_tmpdirname;
38
39 /*+ The amount of RAM to use for filesorting. +*/
40 extern size_t option_filesort_ramsize;
41
42
43 /*++++++++++++++++++++++++++++++++++++++
44 A function to sort the contents of a file of fixed length objects using a
45 limited amount of RAM.
46
47 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
48 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
49 The individual sort steps and the merge step both use a "Heap sort"
50 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
51 if the data is already partially sorted.
52
53 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
54
55 int fd_out The file descriptor of the output file (opened for writing and empty).
56
57 size_t itemsize The size of each item in the file that needs sorting.
58
59 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
60 data to be sorted is an array of things not pointers).
61
62 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
63 returns 1 then it is written to the output file.
64 ++++++++++++++++++++++++++++++++++++++*/
65
66 void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
67 {
68 int *fds=NULL,*heap=NULL;
69 int nfiles=0,ndata=0;
70 index_t count=0,total=0;
71 size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*));
72 void *data=NULL,**datap=NULL;
73 char *filename;
74 int i,more=1;
75
76 /* Allocate the RAM buffer and other bits */
77
78 data=malloc(nitems*itemsize);
79 datap=malloc(nitems*sizeof(void*));
80
81 filename=(char*)malloc(strlen(option_tmpdirname)+24);
82
83 /* Loop around, fill the buffer, sort the data and write a temporary file */
84
85 do
86 {
87 int fd,n=0;
88
89 /* Read in the data and create pointers */
90
91 for(i=0;i<nitems;i++)
92 {
93 datap[i]=data+i*itemsize;
94
95 if(ReadFile(fd_in,datap[i],itemsize))
96 {
97 more=0;
98 break;
99 }
100
101 total++;
102 }
103
104 n=i;
105
106 /* Shortcut if there is no data and no previous files (i.e. no data at all) */
107
108 if(nfiles==0 && n==0)
109 goto tidy_and_exit;
110
111 /* No new data read in this time round */
112
113 if(n==0)
114 break;
115
116 /* Sort the data pointers using a heap sort */
117
118 filesort_heapsort(datap,n,compare);
119
120 /* Shortcut if all read in and sorted at once */
121
122 if(nfiles==0 && !more)
123 {
124 for(i=0;i<n;i++)
125 {
126 if(!buildindex || buildindex(datap[i],count))
127 {
128 WriteFile(fd_out,datap[i],itemsize);
129 count++;
130 }
131 }
132
133 goto tidy_and_exit;
134 }
135
136 /* Create a temporary file and write the result */
137
138 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
139
140 fd=OpenFileNew(filename);
141
142 for(i=0;i<n;i++)
143 WriteFile(fd,datap[i],itemsize);
144
145 CloseFile(fd);
146
147 nfiles++;
148 }
149 while(more);
150
151 /* Shortcut if only one file (unlucky for us there must have been exactly
152 nitems, lucky for us we still have the data in RAM) */
153
154 if(nfiles==1)
155 {
156 for(i=0;i<nitems;i++)
157 {
158 if(!buildindex || buildindex(datap[i],count))
159 {
160 WriteFile(fd_out,datap[i],itemsize);
161 count++;
162 }
163 }
164
165 DeleteFile(filename);
166
167 goto tidy_and_exit;
168 }
169
170 /* Check that number of files is less than file size */
171
172 assert(nfiles<nitems);
173
174 /* Open all of the temporary files */
175
176 fds=(int*)malloc(nfiles*sizeof(int));
177
178 for(i=0;i<nfiles;i++)
179 {
180 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
181
182 fds[i]=ReOpenFile(filename);
183
184 DeleteFile(filename);
185 }
186
187 /* Perform an n-way merge using a binary heap */
188
189 heap=(int*)malloc(nfiles*sizeof(int));
190
191 /* Fill the heap to start with */
192
193 for(i=0;i<nfiles;i++)
194 {
195 int index;
196
197 datap[i]=data+i*itemsize;
198
199 ReadFile(fds[i],datap[i],itemsize);
200
201 index=i;
202 heap[index]=i;
203
204 /* Bubble up the new value */
205
206 while(index>0)
207 {
208 int newindex;
209 int temp;
210
211 newindex=(index-1)/2;
212
213 if(compare(datap[heap[index]],datap[heap[newindex]])>=0)
214 break;
215
216 temp=heap[index];
217 heap[index]=heap[newindex];
218 heap[newindex]=temp;
219
220 index=newindex;
221 }
222 }
223
224 /* Repeatedly pull out the root of the heap and refill from the same file */
225
226 ndata=nfiles;
227
228 do
229 {
230 int index=0;
231
232 if(!buildindex || buildindex(datap[heap[index]],count))
233 {
234 WriteFile(fd_out,datap[heap[index]],itemsize);
235 count++;
236 }
237
238 if(ReadFile(fds[heap[index]],datap[heap[index]],itemsize))
239 {
240 ndata--;
241 heap[index]=heap[ndata];
242 }
243
244 /* Bubble down the new value */
245
246 while((2*index+2)<ndata)
247 {
248 int newindex;
249 int temp;
250
251 newindex=2*index+1;
252
253 if(compare(datap[heap[newindex]],datap[heap[newindex+1]])>0)
254 newindex=newindex+1;
255
256 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
257 break;
258
259 temp=heap[newindex];
260 heap[newindex]=heap[index];
261 heap[index]=temp;
262
263 index=newindex;
264 }
265
266 if((2*index+2)==ndata)
267 {
268 int newindex;
269 int temp;
270
271 newindex=2*index+1;
272
273 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
274 ; /* break */
275 else
276 {
277 temp=heap[newindex];
278 heap[newindex]=heap[index];
279 heap[index]=temp;
280 }
281 }
282 }
283 while(ndata>0);
284
285 /* Tidy up */
286
287 tidy_and_exit:
288
289 if(fds)
290 {
291 for(i=0;i<nfiles;i++)
292 CloseFile(fds[i]);
293 free(fds);
294 }
295
296 if(heap)
297 free(heap);
298
299 free(data);
300 free(datap);
301 free(filename);
302 }
303
304
305 /*++++++++++++++++++++++++++++++++++++++
306 A function to sort the contents of a file of variable length objects (each
307 preceded by its length in FILESORT_VARSIZE bytes) using a limited amount of RAM.
308
309 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
310 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
311 The individual sort steps and the merge step both use a "Heap sort"
312 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
313 if the data is already partially sorted.
314
315 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
316
317 int fd_out The file descriptor of the output file (opened for writing and empty).
318
319 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
320 data to be sorted is an array of things not pointers).
321
322 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
323 returns 1 then it is written to the output file.
324 ++++++++++++++++++++++++++++++++++++++*/
325
326 void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
327 {
328 int *fds=NULL,*heap=NULL;
329 int nfiles=0,ndata=0;
330 index_t count=0,total=0;
331 FILESORT_VARINT nextitemsize,largestitemsize=0;
332 void *data=NULL,**datap=NULL;
333 char *filename;
334 int i,more=1;
335
336 /* Allocate the RAM buffer and other bits */
337
338 data=malloc(option_filesort_ramsize);
339
340 filename=(char*)malloc(strlen(option_tmpdirname)+24);
341
342 /* Loop around, fill the buffer, sort the data and write a temporary file */
343
344 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */
345 goto tidy_and_exit;
346
347 do
348 {
349 int fd,n=0;
350 size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE;
351
352 datap=data+option_filesort_ramsize;
353
354 /* Read in the data and create pointers */
355
356 while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data))
357 {
358 FILESORT_VARINT itemsize=nextitemsize;
359
360 if(itemsize>largestitemsize)
361 largestitemsize=itemsize;
362
363 *(FILESORT_VARINT*)(data+ramused)=itemsize;
364
365 ramused+=FILESORT_VARSIZE;
366
367 ReadFile(fd_in,data+ramused,itemsize);
368
369 *--datap=data+ramused; /* points to real data */
370
371 ramused+=itemsize;
372
373 ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN);
374 ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE;
375
376 total++;
377 n++;
378
379 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE))
380 {
381 more=0;
382 break;
383 }
384 }
385
386 /* No new data read in this time round */
387
388 if(n==0)
389 break;
390
391 /* Sort the data pointers using a heap sort */
392
393 filesort_heapsort(datap,n,compare);
394
395 /* Shortcut if all read in and sorted at once */
396
397 if(nfiles==0 && !more)
398 {
399 for(i=0;i<n;i++)
400 {
401 if(!buildindex || buildindex(datap[i],count))
402 {
403 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
404
405 WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
406 count++;
407 }
408 }
409
410 goto tidy_and_exit;
411 }
412
413 /* Create a temporary file and write the result */
414
415 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
416
417 fd=OpenFileNew(filename);
418
419 for(i=0;i<n;i++)
420 {
421 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
422
423 WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
424 }
425
426 CloseFile(fd);
427
428 nfiles++;
429 }
430 while(more);
431
432 /* Check that number of files is less than file size */
433
434 largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN);
435
436 assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize));
437
438 /* Open all of the temporary files */
439
440 fds=(int*)malloc(nfiles*sizeof(int));
441
442 for(i=0;i<nfiles;i++)
443 {
444 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
445
446 fds[i]=ReOpenFile(filename);
447
448 DeleteFile(filename);
449 }
450
451 /* Perform an n-way merge using a binary heap */
452
453 heap=(int*)malloc(nfiles*sizeof(int));
454
455 datap=data+option_filesort_ramsize-nfiles*sizeof(void*);
456
457 /* Fill the heap to start with */
458
459 for(i=0;i<nfiles;i++)
460 {
461 int index;
462 FILESORT_VARINT itemsize;
463
464 datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize;
465
466 ReadFile(fds[i],&itemsize,FILESORT_VARSIZE);
467
468 *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize;
469
470 ReadFile(fds[i],datap[i],itemsize);
471
472 index=i;
473 heap[index]=i;
474
475 /* Bubble up the new value */
476
477 while(index>0)
478 {
479 int newindex;
480 int temp;
481
482 newindex=(index-1)/2;
483
484 if(compare(datap[heap[index]],datap[heap[newindex]])>=0)
485 break;
486
487 temp=heap[index];
488 heap[index]=heap[newindex];
489 heap[newindex]=temp;
490
491 index=newindex;
492 }
493 }
494
495 /* Repeatedly pull out the root of the heap and refill from the same file */
496
497 ndata=nfiles;
498
499 do
500 {
501 int index=0;
502 FILESORT_VARINT itemsize;
503
504 if(!buildindex || buildindex(datap[heap[index]],count))
505 {
506 itemsize=*(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE);
507
508 WriteFile(fd_out,datap[heap[index]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
509 count++;
510 }
511
512 if(ReadFile(fds[heap[index]],&itemsize,FILESORT_VARSIZE))
513 {
514 ndata--;
515 heap[index]=heap[ndata];
516 }
517 else
518 {
519 *(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE)=itemsize;
520
521 ReadFile(fds[heap[index]],datap[heap[index]],itemsize);
522 }
523
524 /* Bubble down the new value */
525
526 while((2*index+2)<ndata)
527 {
528 int newindex;
529 int temp;
530
531 newindex=2*index+1;
532
533 if(compare(datap[heap[newindex]],datap[heap[newindex+1]])>0)
534 newindex=newindex+1;
535
536 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
537 break;
538
539 temp=heap[newindex];
540 heap[newindex]=heap[index];
541 heap[index]=temp;
542
543 index=newindex;
544 }
545
546 if((2*index+2)==ndata)
547 {
548 int newindex;
549 int temp;
550
551 newindex=2*index+1;
552
553 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
554 ; /* break */
555 else
556 {
557 temp=heap[newindex];
558 heap[newindex]=heap[index];
559 heap[index]=temp;
560 }
561 }
562 }
563 while(ndata>0);
564
565 /* Tidy up */
566
567 tidy_and_exit:
568
569 if(fds)
570 {
571 for(i=0;i<nfiles;i++)
572 CloseFile(fds[i]);
573 free(fds);
574 }
575
576 if(heap)
577 free(heap);
578
579 free(data);
580 free(filename);
581 }
582
583
584 /*++++++++++++++++++++++++++++++++++++++
585 A function to sort an array of pointers efficiently.
586
587 The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort,
588 in particular, this is good because it can operate in-place and doesn't
589 allocate more memory like using qsort() does.
590
591 void **datap A pointer to the array of pointers to sort.
592
593 size_t nitems The number of items of data to sort.
594
595 int(*compare)(const void *, const void *) The comparison function (identical to qsort if the
596 data to be sorted was an array of things not pointers).
597 ++++++++++++++++++++++++++++++++++++++*/
598
599 void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*))
600 {
601 int i;
602
603 /* Fill the heap by pretending to insert the data that is already there */
604
605 for(i=1;i<nitems;i++)
606 {
607 int index=i;
608
609 /* Bubble up the new value (upside-down, put largest at top) */
610
611 while(index>0)
612 {
613 int newindex;
614 void *temp;
615
616 newindex=(index-1)/2;
617
618 if(compare(datap[index],datap[newindex])<=0) /* reversed compared to filesort_fixed() above */
619 break;
620
621 temp=datap[index];
622 datap[index]=datap[newindex];
623 datap[newindex]=temp;
624
625 index=newindex;
626 }
627 }
628
629 /* Repeatedly pull out the root of the heap and swap with the bottom item */
630
631 for(i=nitems-1;i>0;i--)
632 {
633 int index=0;
634 void *temp;
635
636 temp=datap[index];
637 datap[index]=datap[i];
638 datap[i]=temp;
639
640 /* Bubble down the new value (upside-down, put largest at top) */
641
642 while((2*index+2)<i)
643 {
644 int newindex;
645 void *temp;
646
647 newindex=2*index+1;
648
649 if(compare(datap[newindex],datap[newindex+1])<0) /* reversed compared to filesort_fixed() above */
650 newindex=newindex+1;
651
652 if(compare(datap[index],datap[newindex])>=0) /* reversed compared to filesort_fixed() above */
653 break;
654
655 temp=datap[newindex];
656 datap[newindex]=datap[index];
657 datap[index]=temp;
658
659 index=newindex;
660 }
661
662 if((2*index+2)==i)
663 {
664 int newindex;
665 void *temp;
666
667 newindex=2*index+1;
668
669 if(compare(datap[index],datap[newindex])>=0) /* reversed compared to filesort_fixed() above */
670 ; /* break */
671 else
672 {
673 temp=datap[newindex];
674 datap[newindex]=datap[index];
675 datap[index]=temp;
676 }
677 }
678 }
679 }

Properties

Name Value
cvs:description Functions to perform sorting.