Routino SVN Repository Browser

Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino

ViewVC logotype

Contents of /trunk/src/sorting.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 503 - (show annotations) (download) (as text)
Sat Sep 25 13:54:35 2010 UTC (14 years, 5 months ago) by amb
File MIME type: text/x-csrc
File size: 15840 byte(s)
Rename the heapsort() function to filesort_heapsort().

1 /***************************************
2 $Header: /home/amb/CVS/routino/src/sorting.c,v 1.11 2010-09-25 13:54:18 amb Exp $
3
4 Merge sort functions.
5
6 Part of the Routino routing software.
7 ******************/ /******************
8 This file Copyright 2009-2010 Andrew M. Bishop
9
10 This program is free software: you can redistribute it and/or modify
11 it under the terms of the GNU Affero General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU Affero General Public License for more details.
19
20 You should have received a copy of the GNU Affero General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 ***************************************/
23
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <assert.h>
29
30 #include "files.h"
31 #include "functions.h"
32
33
34 /* Variables */
35
36 /*+ The command line '--tmpdir' option or its default value. +*/
37 extern char *option_tmpdirname;
38
39 /*+ The amount of RAM to use for filesorting. +*/
40 extern size_t option_filesort_ramsize;
41
42
43 /*++++++++++++++++++++++++++++++++++++++
44 A function to sort the contents of a file of fixed length objects using a
45 limited amount of RAM.
46
47 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
48 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
49 The individual sort steps and the merge step both use a "Heap sort"
50 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
51 if the data is already partially sorted.
52
53 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
54
55 int fd_out The file descriptor of the output file (opened for writing and empty).
56
57 size_t itemsize The size of each item in the file that needs sorting.
58
59 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
60 data to be sorted is an array of things not pointers).
61
62 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
63 returns 1 then it is written to the output file.
64 ++++++++++++++++++++++++++++++++++++++*/
65
66 void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
67 {
68 int *fds=NULL,*heap=NULL;
69 int nfiles=0,ndata=0;
70 index_t count=0,total=0;
71 size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*));
72 void *data=NULL,**datap=NULL;
73 char *filename;
74 int i,more=1;
75
76 /* Allocate the RAM buffer and other bits */
77
78 data=malloc(nitems*itemsize);
79 datap=malloc(nitems*sizeof(void*));
80
81 filename=(char*)malloc(strlen(option_tmpdirname)+24);
82
83 /* Loop around, fill the buffer, sort the data and write a temporary file */
84
85 do
86 {
87 int fd,n=0;
88
89 /* Read in the data and create pointers */
90
91 for(i=0;i<nitems;i++)
92 {
93 datap[i]=data+i*itemsize;
94
95 if(ReadFile(fd_in,datap[i],itemsize))
96 {
97 more=0;
98 break;
99 }
100
101 total++;
102 }
103
104 n=i;
105
106 if(n==0)
107 break;
108
109 /* Sort the data pointers using a heap sort */
110
111 filesort_heapsort(datap,n,compare);
112
113 /* Shortcut if all read in and sorted at once */
114
115 if(nfiles==0 && !more)
116 {
117 for(i=0;i<n;i++)
118 {
119 if(!buildindex || buildindex(datap[i],count))
120 {
121 WriteFile(fd_out,datap[i],itemsize);
122 count++;
123 }
124 }
125
126 goto tidy_and_exit;
127 }
128
129 /* Create a temporary file and write the result */
130
131 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
132
133 fd=OpenFileNew(filename);
134
135 for(i=0;i<n;i++)
136 WriteFile(fd,datap[i],itemsize);
137
138 CloseFile(fd);
139
140 nfiles++;
141 }
142 while(more);
143
144 /* Shortcut if only one file (unlucky for us there must have been exactly
145 nitems, lucky for us we still have the data in RAM) */
146
147 if(nfiles==1)
148 {
149 for(i=0;i<nitems;i++)
150 {
151 if(!buildindex || buildindex(datap[i],count))
152 {
153 WriteFile(fd_out,datap[i],itemsize);
154 count++;
155 }
156 }
157
158 DeleteFile(filename);
159
160 goto tidy_and_exit;
161 }
162
163 /* Check that number of files is less than file size */
164
165 assert(nfiles<nitems);
166
167 /* Open all of the temporary files */
168
169 fds=(int*)malloc(nfiles*sizeof(int));
170
171 for(i=0;i<nfiles;i++)
172 {
173 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
174
175 fds[i]=ReOpenFile(filename);
176
177 DeleteFile(filename);
178 }
179
180 /* Perform an n-way merge using a binary heap */
181
182 heap=(int*)malloc(nfiles*sizeof(int));
183
184 /* Fill the heap to start with */
185
186 for(i=0;i<nfiles;i++)
187 {
188 int index;
189
190 datap[i]=data+i*itemsize;
191
192 ReadFile(fds[i],datap[i],itemsize);
193
194 heap[i]=i;
195
196 index=i;
197
198 /* Bubble up the new value */
199
200 while(index>0 &&
201 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
202 {
203 int newindex;
204 int temp;
205
206 newindex=(index-1)/2;
207
208 temp=heap[index];
209 heap[index]=heap[newindex];
210 heap[newindex]=temp;
211
212 index=newindex;
213 }
214 }
215
216 /* Repeatedly pull out the root of the heap and refill from the same file */
217
218 ndata=nfiles;
219
220 do
221 {
222 int index=0;
223
224 if(!buildindex || buildindex(datap[heap[0]],count))
225 {
226 WriteFile(fd_out,datap[heap[0]],itemsize);
227 count++;
228 }
229
230 if(ReadFile(fds[heap[0]],datap[heap[0]],itemsize))
231 {
232 ndata--;
233 heap[0]=heap[ndata];
234 }
235
236 /* Bubble down the new value */
237
238 while((2*index+2)<ndata &&
239 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
240 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
241 {
242 int newindex;
243 int temp;
244
245 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
246 newindex=2*index+1;
247 else
248 newindex=2*index+2;
249
250 temp=heap[newindex];
251 heap[newindex]=heap[index];
252 heap[index]=temp;
253
254 index=newindex;
255 }
256
257 if((2*index+2)==ndata &&
258 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
259 {
260 int newindex;
261 int temp;
262
263 newindex=2*index+1;
264
265 temp=heap[newindex];
266 heap[newindex]=heap[index];
267 heap[index]=temp;
268 }
269 }
270 while(ndata>0);
271
272 /* Tidy up */
273
274 tidy_and_exit:
275
276 if(fds)
277 {
278 for(i=0;i<nfiles;i++)
279 CloseFile(fds[i]);
280 free(fds);
281 }
282
283 if(heap)
284 free(heap);
285
286 free(data);
287 free(datap);
288 free(filename);
289 }
290
291
292 /*++++++++++++++++++++++++++++++++++++++
293 A function to sort the contents of a file of variable length objects (each
294 preceded by its length in 2 bytes) using a limited amount of RAM.
295
296 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
297 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
298 The individual sort steps and the merge step both use a "Heap sort"
299 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
300 if the data is already partially sorted.
301
302 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
303
304 int fd_out The file descriptor of the output file (opened for writing and empty).
305
306 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
307 data to be sorted is an array of things not pointers).
308
309 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
310 returns 1 then it is written to the output file.
311 ++++++++++++++++++++++++++++++++++++++*/
312
313 void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
314 {
315 int *fds=NULL,*heap=NULL;
316 int nfiles=0,ndata=0;
317 index_t count=0,total=0;
318 FILESORT_VARINT nextitemsize,largestitemsize=0;
319 void *data=NULL,**datap=NULL;
320 char *filename;
321 int i,more=1;
322
323 /* Allocate the RAM buffer and other bits */
324
325 data=malloc(option_filesort_ramsize);
326
327 filename=(char*)malloc(strlen(option_tmpdirname)+24);
328
329 /* Loop around, fill the buffer, sort the data and write a temporary file */
330
331 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */
332 goto tidy_and_exit;
333
334 do
335 {
336 int fd,n=0;
337 size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE;
338
339 datap=data+option_filesort_ramsize;
340
341 /* Read in the data and create pointers */
342
343 while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data))
344 {
345 FILESORT_VARINT itemsize=nextitemsize;
346
347 if(itemsize>largestitemsize)
348 largestitemsize=itemsize;
349
350 *(FILESORT_VARINT*)(data+ramused)=itemsize;
351
352 ramused+=FILESORT_VARSIZE;
353
354 ReadFile(fd_in,data+ramused,itemsize);
355
356 *--datap=data+ramused; /* points to real data */
357
358 ramused+=itemsize;
359
360 ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN);
361 ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE;
362
363 total++;
364 n++;
365
366 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE))
367 {
368 more=0;
369 break;
370 }
371 }
372
373 if(n==0)
374 break;
375
376 /* Sort the data pointers using a heap sort */
377
378 filesort_heapsort(datap,n,compare);
379
380 /* Shortcut if all read in and sorted at once */
381
382 if(nfiles==0 && !more)
383 {
384 for(i=0;i<n;i++)
385 {
386 if(!buildindex || buildindex(datap[i],count))
387 {
388 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
389
390 WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
391 count++;
392 }
393 }
394
395 goto tidy_and_exit;
396 }
397
398 /* Create a temporary file and write the result */
399
400 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
401
402 fd=OpenFileNew(filename);
403
404 for(i=0;i<n;i++)
405 {
406 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
407
408 WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
409 }
410
411 CloseFile(fd);
412
413 nfiles++;
414 }
415 while(more);
416
417 /* Check that number of files is less than file size */
418
419 largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN);
420
421 assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize));
422
423 /* Open all of the temporary files */
424
425 fds=(int*)malloc(nfiles*sizeof(int));
426
427 for(i=0;i<nfiles;i++)
428 {
429 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
430
431 fds[i]=ReOpenFile(filename);
432
433 DeleteFile(filename);
434 }
435
436 /* Perform an n-way merge using a binary heap */
437
438 heap=(int*)malloc(nfiles*sizeof(int));
439
440 datap=data+option_filesort_ramsize-nfiles*sizeof(void*);
441
442 /* Fill the heap to start with */
443
444 for(i=0;i<nfiles;i++)
445 {
446 int index;
447 FILESORT_VARINT itemsize;
448
449 datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize;
450
451 ReadFile(fds[i],&itemsize,FILESORT_VARSIZE);
452
453 *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize;
454
455 ReadFile(fds[i],datap[i],itemsize);
456
457 heap[i]=i;
458
459 index=i;
460
461 /* Bubble up the new value */
462
463 while(index>0 &&
464 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
465 {
466 int newindex;
467 int temp;
468
469 newindex=(index-1)/2;
470
471 temp=heap[index];
472 heap[index]=heap[newindex];
473 heap[newindex]=temp;
474
475 index=newindex;
476 }
477 }
478
479 /* Repeatedly pull out the root of the heap and refill from the same file */
480
481 ndata=nfiles;
482
483 do
484 {
485 int index=0;
486 FILESORT_VARINT itemsize;
487
488 if(!buildindex || buildindex(datap[heap[0]],count))
489 {
490 itemsize=*(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE);
491
492 WriteFile(fd_out,datap[heap[0]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
493 count++;
494 }
495
496 if(ReadFile(fds[heap[0]],&itemsize,FILESORT_VARSIZE))
497 {
498 ndata--;
499 heap[0]=heap[ndata];
500 }
501 else
502 {
503 *(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE)=itemsize;
504
505 ReadFile(fds[heap[0]],datap[heap[0]],itemsize);
506 }
507
508 /* Bubble down the new value */
509
510 while((2*index+2)<ndata &&
511 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
512 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
513 {
514 int newindex;
515 int temp;
516
517 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
518 newindex=2*index+1;
519 else
520 newindex=2*index+2;
521
522 temp=heap[newindex];
523 heap[newindex]=heap[index];
524 heap[index]=temp;
525
526 index=newindex;
527 }
528
529 if((2*index+2)==ndata &&
530 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
531 {
532 int newindex;
533 int temp;
534
535 newindex=2*index+1;
536
537 temp=heap[newindex];
538 heap[newindex]=heap[index];
539 heap[index]=temp;
540 }
541 }
542 while(ndata>0);
543
544 /* Tidy up */
545
546 tidy_and_exit:
547
548 if(fds)
549 {
550 for(i=0;i<nfiles;i++)
551 CloseFile(fds[i]);
552 free(fds);
553 }
554
555 if(heap)
556 free(heap);
557
558 free(data);
559 free(filename);
560 }
561
562
563 /*++++++++++++++++++++++++++++++++++++++
564 A function to sort an array of pointers efficiently.
565
566 The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort,
567 in particular an this good because it can operate in-place and doesn't
568 allocate more memory like using qsort() does.
569
570 void **datap A pointer to the array of pointers to sort.
571
572 size_t nitems The number of items of data to sort.
573
574 int(*compare)(const void *, const void *) The comparison function (identical to qsort if the
575 data to be sorted was an array of things not pointers).
576 ++++++++++++++++++++++++++++++++++++++*/
577
578 void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*))
579 {
580 int i;
581
582 /* Fill the heap by pretending to insert the data that is already there */
583
584 for(i=1;i<nitems;i++)
585 {
586 int index=i;
587
588 /* Bubble up the new value (upside-down, put largest at top) */
589
590 while(index>0 &&
591 compare(datap[index],datap[(index-1)/2])>0) /* reversed compared to filesort() above */
592 {
593 int newindex;
594 void *temp;
595
596 newindex=(index-1)/2;
597
598 temp=datap[index];
599 datap[index]=datap[newindex];
600 datap[newindex]=temp;
601
602 index=newindex;
603 }
604 }
605
606 /* Repeatedly pull out the root of the heap and swap with the bottom item */
607
608 for(i=nitems-1;i>0;i--)
609 {
610 int index=0;
611 void *temp;
612
613 temp=datap[index];
614 datap[index]=datap[i];
615 datap[i]=temp;
616
617 /* Bubble down the new value (upside-down, put largest at top) */
618
619 while((2*index+2)<i &&
620 (compare(datap[index],datap[2*index+1])<0 || /* reversed compared to filesort() above */
621 compare(datap[index],datap[2*index+2])<0)) /* reversed compared to filesort() above */
622 {
623 int newindex;
624 void *temp;
625
626 if(compare(datap[2*index+1],datap[2*index+2])>0) /* reversed compared to filesort() above */
627 newindex=2*index+1;
628 else
629 newindex=2*index+2;
630
631 temp=datap[newindex];
632 datap[newindex]=datap[index];
633 datap[index]=temp;
634
635 index=newindex;
636 }
637
638 if((2*index+2)==i &&
639 compare(datap[index],datap[2*index+1])<0) /* reversed compared to filesort() above */
640 {
641 int newindex;
642 void *temp;
643
644 newindex=2*index+1;
645
646 temp=datap[newindex];
647 datap[newindex]=datap[index];
648 datap[index]=temp;
649 }
650 }
651 }

Properties

Name Value
cvs:description Functions to perform sorting.