Routino SVN Repository Browser

Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino

ViewVC logotype

Contents of /trunk/src/sorting.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 875 - (show annotations) (download) (as text)
Sat Oct 22 16:27:29 2011 UTC (13 years, 4 months ago) by amb
File MIME type: text/x-csrc
File size: 16165 byte(s)
Revert back to something very close to r869 because it is fastest by a tiny
fraction.

1 /***************************************
2 Merge sort functions.
3
4 Part of the Routino routing software.
5 ******************/ /******************
6 This file Copyright 2009-2011 Andrew M. Bishop
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU Affero General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ***************************************/
21
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <assert.h>
27
28 #include "types.h"
29
30 #include "files.h"
31 #include "sorting.h"
32
33
34 /* Global variables */
35
36 /*+ The command line '--tmpdir' option or its default value. +*/
37 extern char *option_tmpdirname;
38
39 /*+ The amount of RAM to use for filesorting. +*/
40 extern size_t option_filesort_ramsize;
41
42
43 /*++++++++++++++++++++++++++++++++++++++
44 A function to sort the contents of a file of fixed length objects using a
45 limited amount of RAM.
46
47 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
48 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
49 The individual sort steps and the merge step both use a "Heap sort"
50 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
51 if the data is already partially sorted.
52
53 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
54
55 int fd_out The file descriptor of the output file (opened for writing and empty).
56
57 size_t itemsize The size of each item in the file that needs sorting.
58
59 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
60 data to be sorted is an array of things not pointers).
61
62 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
63 returns 1 then it is written to the output file.
64 ++++++++++++++++++++++++++++++++++++++*/
65
66 void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
67 {
68 int *fds=NULL,*heap=NULL;
69 int nfiles=0,ndata=0;
70 index_t count=0,total=0;
71 size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*));
72 void *data=NULL,**datap=NULL;
73 char *filename;
74 int i,more=1;
75
76 /* Allocate the RAM buffer and other bits */
77
78 data=malloc(nitems*itemsize);
79 datap=malloc(nitems*sizeof(void*));
80
81 filename=(char*)malloc(strlen(option_tmpdirname)+24);
82
83 /* Loop around, fill the buffer, sort the data and write a temporary file */
84
85 do
86 {
87 int fd,n=0;
88
89 /* Read in the data and create pointers */
90
91 for(i=0;i<nitems;i++)
92 {
93 datap[i]=data+i*itemsize;
94
95 if(ReadFile(fd_in,datap[i],itemsize))
96 {
97 more=0;
98 break;
99 }
100
101 total++;
102 }
103
104 n=i;
105
106 /* Shortcut if there is no data and no previous files (i.e. no data at all) */
107
108 if(nfiles==0 && n==0)
109 goto tidy_and_exit;
110
111 /* No new data read in this time round */
112
113 if(n==0)
114 break;
115
116 /* Sort the data pointers using a heap sort */
117
118 filesort_heapsort(datap,n,compare);
119
120 /* Shortcut if all read in and sorted at once */
121
122 if(nfiles==0 && !more)
123 {
124 for(i=0;i<n;i++)
125 {
126 if(!buildindex || buildindex(datap[i],count))
127 {
128 WriteFile(fd_out,datap[i],itemsize);
129 count++;
130 }
131 }
132
133 goto tidy_and_exit;
134 }
135
136 /* Create a temporary file and write the result */
137
138 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
139
140 fd=OpenFileNew(filename);
141
142 for(i=0;i<n;i++)
143 WriteFile(fd,datap[i],itemsize);
144
145 CloseFile(fd);
146
147 nfiles++;
148 }
149 while(more);
150
151 /* Shortcut if only one file (unlucky for us there must have been exactly
152 nitems, lucky for us we still have the data in RAM) */
153
154 if(nfiles==1)
155 {
156 for(i=0;i<nitems;i++)
157 {
158 if(!buildindex || buildindex(datap[i],count))
159 {
160 WriteFile(fd_out,datap[i],itemsize);
161 count++;
162 }
163 }
164
165 DeleteFile(filename);
166
167 goto tidy_and_exit;
168 }
169
170 /* Check that number of files is less than file size */
171
172 assert(nfiles<nitems);
173
174 /* Open all of the temporary files */
175
176 fds=(int*)malloc(nfiles*sizeof(int));
177
178 for(i=0;i<nfiles;i++)
179 {
180 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
181
182 fds[i]=ReOpenFile(filename);
183
184 DeleteFile(filename);
185 }
186
187 /* Perform an n-way merge using a binary heap */
188
189 heap=(int*)malloc((1+nfiles)*sizeof(int));
190
191 /* Fill the heap to start with */
192
193 for(i=0;i<nfiles;i++)
194 {
195 int index;
196
197 datap[i]=data+i*itemsize;
198
199 ReadFile(fds[i],datap[i],itemsize);
200
201 index=i+1;
202
203 heap[index]=i;
204
205 /* Bubble up the new value */
206
207 while(index>1)
208 {
209 int newindex;
210 int temp;
211
212 newindex=index/2;
213
214 if(compare(datap[heap[index]],datap[heap[newindex]])>=0)
215 break;
216
217 temp=heap[index];
218 heap[index]=heap[newindex];
219 heap[newindex]=temp;
220
221 index=newindex;
222 }
223 }
224
225 /* Repeatedly pull out the root of the heap and refill from the same file */
226
227 ndata=nfiles;
228
229 do
230 {
231 int index=1;
232
233 if(!buildindex || buildindex(datap[heap[index]],count))
234 {
235 WriteFile(fd_out,datap[heap[index]],itemsize);
236 count++;
237 }
238
239 if(ReadFile(fds[heap[index]],datap[heap[index]],itemsize))
240 {
241 heap[index]=heap[ndata];
242 ndata--;
243 }
244
245 /* Bubble down the new value */
246
247 while((2*index)<ndata)
248 {
249 int newindex;
250 int temp;
251
252 newindex=2*index;
253
254 if(compare(datap[heap[newindex]],datap[heap[newindex+1]])>=0)
255 newindex=newindex+1;
256
257 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
258 break;
259
260 temp=heap[newindex];
261 heap[newindex]=heap[index];
262 heap[index]=temp;
263
264 index=newindex;
265 }
266
267 if((2*index)==ndata)
268 {
269 int newindex;
270 int temp;
271
272 newindex=2*index;
273
274 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
275 ; /* break */
276 else
277 {
278 temp=heap[newindex];
279 heap[newindex]=heap[index];
280 heap[index]=temp;
281 }
282 }
283 }
284 while(ndata>0);
285
286 /* Tidy up */
287
288 tidy_and_exit:
289
290 if(fds)
291 {
292 for(i=0;i<nfiles;i++)
293 CloseFile(fds[i]);
294 free(fds);
295 }
296
297 if(heap)
298 free(heap);
299
300 free(data);
301 free(datap);
302 free(filename);
303 }
304
305
306 /*++++++++++++++++++++++++++++++++++++++
307 A function to sort the contents of a file of variable length objects (each
308 preceded by its length in FILESORT_VARSIZE bytes) using a limited amount of RAM.
309
310 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
311 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
312 The individual sort steps and the merge step both use a "Heap sort"
313 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
314 if the data is already partially sorted.
315
316 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
317
318 int fd_out The file descriptor of the output file (opened for writing and empty).
319
320 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
321 data to be sorted is an array of things not pointers).
322
323 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
324 returns 1 then it is written to the output file.
325 ++++++++++++++++++++++++++++++++++++++*/
326
327 void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
328 {
329 int *fds=NULL,*heap=NULL;
330 int nfiles=0,ndata=0;
331 index_t count=0,total=0;
332 FILESORT_VARINT nextitemsize,largestitemsize=0;
333 void *data=NULL,**datap=NULL;
334 char *filename;
335 int i,more=1;
336
337 /* Allocate the RAM buffer and other bits */
338
339 data=malloc(option_filesort_ramsize);
340
341 filename=(char*)malloc(strlen(option_tmpdirname)+24);
342
343 /* Loop around, fill the buffer, sort the data and write a temporary file */
344
345 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */
346 goto tidy_and_exit;
347
348 do
349 {
350 int fd,n=0;
351 size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE;
352
353 datap=data+option_filesort_ramsize;
354
355 /* Read in the data and create pointers */
356
357 while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data))
358 {
359 FILESORT_VARINT itemsize=nextitemsize;
360
361 if(itemsize>largestitemsize)
362 largestitemsize=itemsize;
363
364 *(FILESORT_VARINT*)(data+ramused)=itemsize;
365
366 ramused+=FILESORT_VARSIZE;
367
368 ReadFile(fd_in,data+ramused,itemsize);
369
370 *--datap=data+ramused; /* points to real data */
371
372 ramused+=itemsize;
373
374 ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN);
375 ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE;
376
377 total++;
378 n++;
379
380 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE))
381 {
382 more=0;
383 break;
384 }
385 }
386
387 /* No new data read in this time round */
388
389 if(n==0)
390 break;
391
392 /* Sort the data pointers using a heap sort */
393
394 filesort_heapsort(datap,n,compare);
395
396 /* Shortcut if all read in and sorted at once */
397
398 if(nfiles==0 && !more)
399 {
400 for(i=0;i<n;i++)
401 {
402 if(!buildindex || buildindex(datap[i],count))
403 {
404 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
405
406 WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
407 count++;
408 }
409 }
410
411 goto tidy_and_exit;
412 }
413
414 /* Create a temporary file and write the result */
415
416 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
417
418 fd=OpenFileNew(filename);
419
420 for(i=0;i<n;i++)
421 {
422 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
423
424 WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
425 }
426
427 CloseFile(fd);
428
429 nfiles++;
430 }
431 while(more);
432
433 /* Check that number of files is less than file size */
434
435 largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN);
436
437 assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize));
438
439 /* Open all of the temporary files */
440
441 fds=(int*)malloc(nfiles*sizeof(int));
442
443 for(i=0;i<nfiles;i++)
444 {
445 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
446
447 fds[i]=ReOpenFile(filename);
448
449 DeleteFile(filename);
450 }
451
452 /* Perform an n-way merge using a binary heap */
453
454 heap=(int*)malloc((1+nfiles)*sizeof(int));
455
456 datap=data+option_filesort_ramsize-nfiles*sizeof(void*);
457
458 /* Fill the heap to start with */
459
460 for(i=0;i<nfiles;i++)
461 {
462 int index;
463 FILESORT_VARINT itemsize;
464
465 datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize;
466
467 ReadFile(fds[i],&itemsize,FILESORT_VARSIZE);
468
469 *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize;
470
471 ReadFile(fds[i],datap[i],itemsize);
472
473 index=i+1;
474
475 heap[index]=i;
476
477 /* Bubble up the new value */
478
479 while(index>1)
480 {
481 int newindex;
482 int temp;
483
484 newindex=index/2;
485
486 if(compare(datap[heap[index]],datap[heap[newindex]])>=0)
487 break;
488
489 temp=heap[index];
490 heap[index]=heap[newindex];
491 heap[newindex]=temp;
492
493 index=newindex;
494 }
495 }
496
497 /* Repeatedly pull out the root of the heap and refill from the same file */
498
499 ndata=nfiles;
500
501 do
502 {
503 int index=1;
504 FILESORT_VARINT itemsize;
505
506 if(!buildindex || buildindex(datap[heap[index]],count))
507 {
508 itemsize=*(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE);
509
510 WriteFile(fd_out,datap[heap[index]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
511 count++;
512 }
513
514 if(ReadFile(fds[heap[index]],&itemsize,FILESORT_VARSIZE))
515 {
516 heap[index]=heap[ndata];
517 ndata--;
518 }
519 else
520 {
521 *(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE)=itemsize;
522
523 ReadFile(fds[heap[index]],datap[heap[index]],itemsize);
524 }
525
526 /* Bubble down the new value */
527
528 while((2*index)<ndata)
529 {
530 int newindex;
531 int temp;
532
533 newindex=2*index;
534
535 if(compare(datap[heap[newindex]],datap[heap[newindex+1]])>=0)
536 newindex=newindex+1;
537
538 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
539 break;
540
541 temp=heap[newindex];
542 heap[newindex]=heap[index];
543 heap[index]=temp;
544
545 index=newindex;
546 }
547
548 if((2*index)==ndata)
549 {
550 int newindex;
551 int temp;
552
553 newindex=2*index;
554
555 if(compare(datap[heap[index]],datap[heap[newindex]])<=0)
556 ; /* break */
557 else
558 {
559 temp=heap[newindex];
560 heap[newindex]=heap[index];
561 heap[index]=temp;
562 }
563 }
564 }
565 while(ndata>0);
566
567 /* Tidy up */
568
569 tidy_and_exit:
570
571 if(fds)
572 {
573 for(i=0;i<nfiles;i++)
574 CloseFile(fds[i]);
575 free(fds);
576 }
577
578 if(heap)
579 free(heap);
580
581 free(data);
582 free(filename);
583 }
584
585
586 /*++++++++++++++++++++++++++++++++++++++
587 A function to sort an array of pointers efficiently.
588
589 The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort,
590 in particular, this is good because it can operate in-place and doesn't
591 allocate more memory like using qsort() does.
592
593 void **datap A pointer to the array of pointers to sort.
594
595 size_t nitems The number of items of data to sort.
596
597 int(*compare)(const void *, const void *) The comparison function (identical to qsort if the
598 data to be sorted was an array of things not pointers).
599 ++++++++++++++++++++++++++++++++++++++*/
600
601 void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*))
602 {
603 void **datap1=&datap[-1];
604 int i;
605
606 /* Fill the heap by pretending to insert the data that is already there */
607
608 for(i=2;i<=nitems;i++)
609 {
610 int index=i;
611
612 /* Bubble up the new value (upside-down, put largest at top) */
613
614 while(index>1)
615 {
616 int newindex;
617 void *temp;
618
619 newindex=index/2;
620
621 if(compare(datap1[index],datap1[newindex])<=0) /* reversed compared to filesort_fixed() above */
622 break;
623
624 temp=datap1[index];
625 datap1[index]=datap1[newindex];
626 datap1[newindex]=temp;
627
628 index=newindex;
629 }
630 }
631
632 /* Repeatedly pull out the root of the heap and swap with the bottom item */
633
634 for(i=nitems;i>1;i--)
635 {
636 int index=1;
637 void *temp;
638
639 temp=datap1[index];
640 datap1[index]=datap1[i];
641 datap1[i]=temp;
642
643 /* Bubble down the new value (upside-down, put largest at top) */
644
645 while((2*index)<(i-1))
646 {
647 int newindex;
648 void *temp;
649
650 newindex=2*index;
651
652 if(compare(datap1[newindex],datap1[newindex+1])<=0) /* reversed compared to filesort_fixed() above */
653 newindex=newindex+1;
654
655 if(compare(datap1[index],datap1[newindex])>=0) /* reversed compared to filesort_fixed() above */
656 break;
657
658 temp=datap1[newindex];
659 datap1[newindex]=datap1[index];
660 datap1[index]=temp;
661
662 index=newindex;
663 }
664
665 if((2*index)==(i-1))
666 {
667 int newindex;
668 void *temp;
669
670 newindex=2*index;
671
672 if(compare(datap1[index],datap1[newindex])>=0) /* reversed compared to filesort_fixed() above */
673 ; /* break */
674 else
675 {
676 temp=datap1[newindex];
677 datap1[newindex]=datap1[index];
678 datap1[index]=temp;
679 }
680 }
681 }
682 }

Properties

Name Value
cvs:description Functions to perform sorting.