Routino SVN Repository Browser

Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino

ViewVC logotype

Contents of /trunk/src/sorting.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 543 - (show annotations) (download) (as text)
Sat Dec 18 19:12:03 2010 UTC (14 years, 3 months ago) by amb
File MIME type: text/x-csrc
File size: 16070 byte(s)
Handle the case where there is no data in the file.

1 /***************************************
2 $Header: /home/amb/CVS/routino/src/sorting.c,v 1.13 2010-12-18 19:12:03 amb Exp $
3
4 Merge sort functions.
5
6 Part of the Routino routing software.
7 ******************/ /******************
8 This file Copyright 2009-2010 Andrew M. Bishop
9
10 This program is free software: you can redistribute it and/or modify
11 it under the terms of the GNU Affero General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU Affero General Public License for more details.
19
20 You should have received a copy of the GNU Affero General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 ***************************************/
23
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <assert.h>
29
30 #include "types.h"
31
32 #include "files.h"
33 #include "sorting.h"
34
35
36 /* Variables */
37
38 /*+ The command line '--tmpdir' option or its default value. +*/
39 extern char *option_tmpdirname;
40
41 /*+ The amount of RAM to use for filesorting. +*/
42 extern size_t option_filesort_ramsize;
43
44
45 /*++++++++++++++++++++++++++++++++++++++
46 A function to sort the contents of a file of fixed length objects using a
47 limited amount of RAM.
48
49 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
50 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
51 The individual sort steps and the merge step both use a "Heap sort"
52 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
53 if the data is already partially sorted.
54
55 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
56
57 int fd_out The file descriptor of the output file (opened for writing and empty).
58
59 size_t itemsize The size of each item in the file that needs sorting.
60
61 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
62 data to be sorted is an array of things not pointers).
63
64 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
65 returns 1 then it is written to the output file.
66 ++++++++++++++++++++++++++++++++++++++*/
67
68 void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
69 {
70 int *fds=NULL,*heap=NULL;
71 int nfiles=0,ndata=0;
72 index_t count=0,total=0;
73 size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*));
74 void *data=NULL,**datap=NULL;
75 char *filename;
76 int i,more=1;
77
78 /* Allocate the RAM buffer and other bits */
79
80 data=malloc(nitems*itemsize);
81 datap=malloc(nitems*sizeof(void*));
82
83 filename=(char*)malloc(strlen(option_tmpdirname)+24);
84
85 /* Loop around, fill the buffer, sort the data and write a temporary file */
86
87 do
88 {
89 int fd,n=0;
90
91 /* Read in the data and create pointers */
92
93 for(i=0;i<nitems;i++)
94 {
95 datap[i]=data+i*itemsize;
96
97 if(ReadFile(fd_in,datap[i],itemsize))
98 {
99 more=0;
100 break;
101 }
102
103 total++;
104 }
105
106 n=i;
107
108 /* No new data read in this time round */
109
110 if(n==0)
111 break;
112
113 /* Shortcut if there are no files (i.e. no data at all) */
114
115 if(nfiles==0 && n==0)
116 goto tidy_and_exit;
117
118 /* Sort the data pointers using a heap sort */
119
120 filesort_heapsort(datap,n,compare);
121
122 /* Shortcut if all read in and sorted at once */
123
124 if(nfiles==0 && !more)
125 {
126 for(i=0;i<n;i++)
127 {
128 if(!buildindex || buildindex(datap[i],count))
129 {
130 WriteFile(fd_out,datap[i],itemsize);
131 count++;
132 }
133 }
134
135 goto tidy_and_exit;
136 }
137
138 /* Create a temporary file and write the result */
139
140 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
141
142 fd=OpenFileNew(filename);
143
144 for(i=0;i<n;i++)
145 WriteFile(fd,datap[i],itemsize);
146
147 CloseFile(fd);
148
149 nfiles++;
150 }
151 while(more);
152
153 /* Shortcut if only one file (unlucky for us there must have been exactly
154 nitems, lucky for us we still have the data in RAM) */
155
156 if(nfiles==1)
157 {
158 for(i=0;i<nitems;i++)
159 {
160 if(!buildindex || buildindex(datap[i],count))
161 {
162 WriteFile(fd_out,datap[i],itemsize);
163 count++;
164 }
165 }
166
167 DeleteFile(filename);
168
169 goto tidy_and_exit;
170 }
171
172 /* Check that number of files is less than file size */
173
174 assert(nfiles<nitems);
175
176 /* Open all of the temporary files */
177
178 fds=(int*)malloc(nfiles*sizeof(int));
179
180 for(i=0;i<nfiles;i++)
181 {
182 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
183
184 fds[i]=ReOpenFile(filename);
185
186 DeleteFile(filename);
187 }
188
189 /* Perform an n-way merge using a binary heap */
190
191 heap=(int*)malloc(nfiles*sizeof(int));
192
193 /* Fill the heap to start with */
194
195 for(i=0;i<nfiles;i++)
196 {
197 int index;
198
199 datap[i]=data+i*itemsize;
200
201 ReadFile(fds[i],datap[i],itemsize);
202
203 heap[i]=i;
204
205 index=i;
206
207 /* Bubble up the new value */
208
209 while(index>0 &&
210 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
211 {
212 int newindex;
213 int temp;
214
215 newindex=(index-1)/2;
216
217 temp=heap[index];
218 heap[index]=heap[newindex];
219 heap[newindex]=temp;
220
221 index=newindex;
222 }
223 }
224
225 /* Repeatedly pull out the root of the heap and refill from the same file */
226
227 ndata=nfiles;
228
229 do
230 {
231 int index=0;
232
233 if(!buildindex || buildindex(datap[heap[0]],count))
234 {
235 WriteFile(fd_out,datap[heap[0]],itemsize);
236 count++;
237 }
238
239 if(ReadFile(fds[heap[0]],datap[heap[0]],itemsize))
240 {
241 ndata--;
242 heap[0]=heap[ndata];
243 }
244
245 /* Bubble down the new value */
246
247 while((2*index+2)<ndata &&
248 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
249 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
250 {
251 int newindex;
252 int temp;
253
254 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
255 newindex=2*index+1;
256 else
257 newindex=2*index+2;
258
259 temp=heap[newindex];
260 heap[newindex]=heap[index];
261 heap[index]=temp;
262
263 index=newindex;
264 }
265
266 if((2*index+2)==ndata &&
267 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
268 {
269 int newindex;
270 int temp;
271
272 newindex=2*index+1;
273
274 temp=heap[newindex];
275 heap[newindex]=heap[index];
276 heap[index]=temp;
277 }
278 }
279 while(ndata>0);
280
281 /* Tidy up */
282
283 tidy_and_exit:
284
285 if(fds)
286 {
287 for(i=0;i<nfiles;i++)
288 CloseFile(fds[i]);
289 free(fds);
290 }
291
292 if(heap)
293 free(heap);
294
295 free(data);
296 free(datap);
297 free(filename);
298 }
299
300
301 /*++++++++++++++++++++++++++++++++++++++
302 A function to sort the contents of a file of variable length objects (each
303 preceded by its length in 2 bytes) using a limited amount of RAM.
304
305 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
306 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
307 The individual sort steps and the merge step both use a "Heap sort"
308 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
309 if the data is already partially sorted.
310
311 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
312
313 int fd_out The file descriptor of the output file (opened for writing and empty).
314
315 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
316 data to be sorted is an array of things not pointers).
317
318 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
319 returns 1 then it is written to the output file.
320 ++++++++++++++++++++++++++++++++++++++*/
321
322 void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
323 {
324 int *fds=NULL,*heap=NULL;
325 int nfiles=0,ndata=0;
326 index_t count=0,total=0;
327 FILESORT_VARINT nextitemsize,largestitemsize=0;
328 void *data=NULL,**datap=NULL;
329 char *filename;
330 int i,more=1;
331
332 /* Allocate the RAM buffer and other bits */
333
334 data=malloc(option_filesort_ramsize);
335
336 filename=(char*)malloc(strlen(option_tmpdirname)+24);
337
338 /* Loop around, fill the buffer, sort the data and write a temporary file */
339
340 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */
341 goto tidy_and_exit;
342
343 do
344 {
345 int fd,n=0;
346 size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE;
347
348 datap=data+option_filesort_ramsize;
349
350 /* Read in the data and create pointers */
351
352 while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data))
353 {
354 FILESORT_VARINT itemsize=nextitemsize;
355
356 if(itemsize>largestitemsize)
357 largestitemsize=itemsize;
358
359 *(FILESORT_VARINT*)(data+ramused)=itemsize;
360
361 ramused+=FILESORT_VARSIZE;
362
363 ReadFile(fd_in,data+ramused,itemsize);
364
365 *--datap=data+ramused; /* points to real data */
366
367 ramused+=itemsize;
368
369 ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN);
370 ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE;
371
372 total++;
373 n++;
374
375 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE))
376 {
377 more=0;
378 break;
379 }
380 }
381
382 /* No new data read in this time round */
383
384 if(n==0)
385 break;
386
387 /* Sort the data pointers using a heap sort */
388
389 filesort_heapsort(datap,n,compare);
390
391 /* Shortcut if all read in and sorted at once */
392
393 if(nfiles==0 && !more)
394 {
395 for(i=0;i<n;i++)
396 {
397 if(!buildindex || buildindex(datap[i],count))
398 {
399 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
400
401 WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
402 count++;
403 }
404 }
405
406 goto tidy_and_exit;
407 }
408
409 /* Create a temporary file and write the result */
410
411 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
412
413 fd=OpenFileNew(filename);
414
415 for(i=0;i<n;i++)
416 {
417 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
418
419 WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
420 }
421
422 CloseFile(fd);
423
424 nfiles++;
425 }
426 while(more);
427
428 /* Check that number of files is less than file size */
429
430 largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN);
431
432 assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize));
433
434 /* Open all of the temporary files */
435
436 fds=(int*)malloc(nfiles*sizeof(int));
437
438 for(i=0;i<nfiles;i++)
439 {
440 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
441
442 fds[i]=ReOpenFile(filename);
443
444 DeleteFile(filename);
445 }
446
447 /* Perform an n-way merge using a binary heap */
448
449 heap=(int*)malloc(nfiles*sizeof(int));
450
451 datap=data+option_filesort_ramsize-nfiles*sizeof(void*);
452
453 /* Fill the heap to start with */
454
455 for(i=0;i<nfiles;i++)
456 {
457 int index;
458 FILESORT_VARINT itemsize;
459
460 datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize;
461
462 ReadFile(fds[i],&itemsize,FILESORT_VARSIZE);
463
464 *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize;
465
466 ReadFile(fds[i],datap[i],itemsize);
467
468 heap[i]=i;
469
470 index=i;
471
472 /* Bubble up the new value */
473
474 while(index>0 &&
475 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
476 {
477 int newindex;
478 int temp;
479
480 newindex=(index-1)/2;
481
482 temp=heap[index];
483 heap[index]=heap[newindex];
484 heap[newindex]=temp;
485
486 index=newindex;
487 }
488 }
489
490 /* Repeatedly pull out the root of the heap and refill from the same file */
491
492 ndata=nfiles;
493
494 do
495 {
496 int index=0;
497 FILESORT_VARINT itemsize;
498
499 if(!buildindex || buildindex(datap[heap[0]],count))
500 {
501 itemsize=*(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE);
502
503 WriteFile(fd_out,datap[heap[0]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
504 count++;
505 }
506
507 if(ReadFile(fds[heap[0]],&itemsize,FILESORT_VARSIZE))
508 {
509 ndata--;
510 heap[0]=heap[ndata];
511 }
512 else
513 {
514 *(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE)=itemsize;
515
516 ReadFile(fds[heap[0]],datap[heap[0]],itemsize);
517 }
518
519 /* Bubble down the new value */
520
521 while((2*index+2)<ndata &&
522 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
523 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
524 {
525 int newindex;
526 int temp;
527
528 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
529 newindex=2*index+1;
530 else
531 newindex=2*index+2;
532
533 temp=heap[newindex];
534 heap[newindex]=heap[index];
535 heap[index]=temp;
536
537 index=newindex;
538 }
539
540 if((2*index+2)==ndata &&
541 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
542 {
543 int newindex;
544 int temp;
545
546 newindex=2*index+1;
547
548 temp=heap[newindex];
549 heap[newindex]=heap[index];
550 heap[index]=temp;
551 }
552 }
553 while(ndata>0);
554
555 /* Tidy up */
556
557 tidy_and_exit:
558
559 if(fds)
560 {
561 for(i=0;i<nfiles;i++)
562 CloseFile(fds[i]);
563 free(fds);
564 }
565
566 if(heap)
567 free(heap);
568
569 free(data);
570 free(filename);
571 }
572
573
574 /*++++++++++++++++++++++++++++++++++++++
575 A function to sort an array of pointers efficiently.
576
577 The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort,
578 in particular an this good because it can operate in-place and doesn't
579 allocate more memory like using qsort() does.
580
581 void **datap A pointer to the array of pointers to sort.
582
583 size_t nitems The number of items of data to sort.
584
585 int(*compare)(const void *, const void *) The comparison function (identical to qsort if the
586 data to be sorted was an array of things not pointers).
587 ++++++++++++++++++++++++++++++++++++++*/
588
589 void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*))
590 {
591 int i;
592
593 /* Fill the heap by pretending to insert the data that is already there */
594
595 for(i=1;i<nitems;i++)
596 {
597 int index=i;
598
599 /* Bubble up the new value (upside-down, put largest at top) */
600
601 while(index>0 &&
602 compare(datap[index],datap[(index-1)/2])>0) /* reversed compared to filesort() above */
603 {
604 int newindex;
605 void *temp;
606
607 newindex=(index-1)/2;
608
609 temp=datap[index];
610 datap[index]=datap[newindex];
611 datap[newindex]=temp;
612
613 index=newindex;
614 }
615 }
616
617 /* Repeatedly pull out the root of the heap and swap with the bottom item */
618
619 for(i=nitems-1;i>0;i--)
620 {
621 int index=0;
622 void *temp;
623
624 temp=datap[index];
625 datap[index]=datap[i];
626 datap[i]=temp;
627
628 /* Bubble down the new value (upside-down, put largest at top) */
629
630 while((2*index+2)<i &&
631 (compare(datap[index],datap[2*index+1])<0 || /* reversed compared to filesort() above */
632 compare(datap[index],datap[2*index+2])<0)) /* reversed compared to filesort() above */
633 {
634 int newindex;
635 void *temp;
636
637 if(compare(datap[2*index+1],datap[2*index+2])>0) /* reversed compared to filesort() above */
638 newindex=2*index+1;
639 else
640 newindex=2*index+2;
641
642 temp=datap[newindex];
643 datap[newindex]=datap[index];
644 datap[index]=temp;
645
646 index=newindex;
647 }
648
649 if((2*index+2)==i &&
650 compare(datap[index],datap[2*index+1])<0) /* reversed compared to filesort() above */
651 {
652 int newindex;
653 void *temp;
654
655 newindex=2*index+1;
656
657 temp=datap[newindex];
658 datap[newindex]=datap[index];
659 datap[index]=temp;
660 }
661 }
662 }

Properties

Name Value
cvs:description Functions to perform sorting.