Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino
Contents of /trunk/src/sorting.c
Parent Directory
|
Revision Log
Revision 873 -
(show annotations)
(download)
(as text)
Sat Oct 22 14:04:52 2011 UTC (13 years, 5 months ago) by amb
File MIME type: text/x-csrc
File size: 16126 byte(s)
Sat Oct 22 14:04:52 2011 UTC (13 years, 5 months ago) by amb
File MIME type: text/x-csrc
File size: 16126 byte(s)
Revert back to r864 zero-based binary heap but with r868/r869 refactored code.
1 | /*************************************** |
2 | Merge sort functions. |
3 | |
4 | Part of the Routino routing software. |
5 | ******************/ /****************** |
6 | This file Copyright 2009-2011 Andrew M. Bishop |
7 | |
8 | This program is free software: you can redistribute it and/or modify |
9 | it under the terms of the GNU Affero General Public License as published by |
10 | the Free Software Foundation, either version 3 of the License, or |
11 | (at your option) any later version. |
12 | |
13 | This program is distributed in the hope that it will be useful, |
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | GNU Affero General Public License for more details. |
17 | |
18 | You should have received a copy of the GNU Affero General Public License |
19 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | ***************************************/ |
21 | |
22 | |
23 | #include <stdio.h> |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | #include <assert.h> |
27 | |
28 | #include "types.h" |
29 | |
30 | #include "files.h" |
31 | #include "sorting.h" |
32 | |
33 | |
34 | /* Global variables */ |
35 | |
36 | /*+ The command line '--tmpdir' option or its default value. +*/ |
37 | extern char *option_tmpdirname; |
38 | |
39 | /*+ The amount of RAM to use for filesorting. +*/ |
40 | extern size_t option_filesort_ramsize; |
41 | |
42 | |
43 | /*++++++++++++++++++++++++++++++++++++++ |
44 | A function to sort the contents of a file of fixed length objects using a |
45 | limited amount of RAM. |
46 | |
47 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
48 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
49 | The individual sort steps and the merge step both use a "Heap sort" |
50 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
51 | if the data is already partially sorted. |
52 | |
53 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
54 | |
55 | int fd_out The file descriptor of the output file (opened for writing and empty). |
56 | |
57 | size_t itemsize The size of each item in the file that needs sorting. |
58 | |
59 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
60 | data to be sorted is an array of things not pointers). |
61 | |
62 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
63 | returns 1 then it is written to the output file. |
64 | ++++++++++++++++++++++++++++++++++++++*/ |
65 | |
66 | void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
67 | { |
68 | int *fds=NULL,*heap=NULL; |
69 | int nfiles=0,ndata=0; |
70 | index_t count=0,total=0; |
71 | size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*)); |
72 | void *data=NULL,**datap=NULL; |
73 | char *filename; |
74 | int i,more=1; |
75 | |
76 | /* Allocate the RAM buffer and other bits */ |
77 | |
78 | data=malloc(nitems*itemsize); |
79 | datap=malloc(nitems*sizeof(void*)); |
80 | |
81 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
82 | |
83 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
84 | |
85 | do |
86 | { |
87 | int fd,n=0; |
88 | |
89 | /* Read in the data and create pointers */ |
90 | |
91 | for(i=0;i<nitems;i++) |
92 | { |
93 | datap[i]=data+i*itemsize; |
94 | |
95 | if(ReadFile(fd_in,datap[i],itemsize)) |
96 | { |
97 | more=0; |
98 | break; |
99 | } |
100 | |
101 | total++; |
102 | } |
103 | |
104 | n=i; |
105 | |
106 | /* Shortcut if there is no data and no previous files (i.e. no data at all) */ |
107 | |
108 | if(nfiles==0 && n==0) |
109 | goto tidy_and_exit; |
110 | |
111 | /* No new data read in this time round */ |
112 | |
113 | if(n==0) |
114 | break; |
115 | |
116 | /* Sort the data pointers using a heap sort */ |
117 | |
118 | filesort_heapsort(datap,n,compare); |
119 | |
120 | /* Shortcut if all read in and sorted at once */ |
121 | |
122 | if(nfiles==0 && !more) |
123 | { |
124 | for(i=0;i<n;i++) |
125 | { |
126 | if(!buildindex || buildindex(datap[i],count)) |
127 | { |
128 | WriteFile(fd_out,datap[i],itemsize); |
129 | count++; |
130 | } |
131 | } |
132 | |
133 | goto tidy_and_exit; |
134 | } |
135 | |
136 | /* Create a temporary file and write the result */ |
137 | |
138 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
139 | |
140 | fd=OpenFileNew(filename); |
141 | |
142 | for(i=0;i<n;i++) |
143 | WriteFile(fd,datap[i],itemsize); |
144 | |
145 | CloseFile(fd); |
146 | |
147 | nfiles++; |
148 | } |
149 | while(more); |
150 | |
151 | /* Shortcut if only one file (unlucky for us there must have been exactly |
152 | nitems, lucky for us we still have the data in RAM) */ |
153 | |
154 | if(nfiles==1) |
155 | { |
156 | for(i=0;i<nitems;i++) |
157 | { |
158 | if(!buildindex || buildindex(datap[i],count)) |
159 | { |
160 | WriteFile(fd_out,datap[i],itemsize); |
161 | count++; |
162 | } |
163 | } |
164 | |
165 | DeleteFile(filename); |
166 | |
167 | goto tidy_and_exit; |
168 | } |
169 | |
170 | /* Check that number of files is less than file size */ |
171 | |
172 | assert(nfiles<nitems); |
173 | |
174 | /* Open all of the temporary files */ |
175 | |
176 | fds=(int*)malloc(nfiles*sizeof(int)); |
177 | |
178 | for(i=0;i<nfiles;i++) |
179 | { |
180 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
181 | |
182 | fds[i]=ReOpenFile(filename); |
183 | |
184 | DeleteFile(filename); |
185 | } |
186 | |
187 | /* Perform an n-way merge using a binary heap */ |
188 | |
189 | heap=(int*)malloc(nfiles*sizeof(int)); |
190 | |
191 | /* Fill the heap to start with */ |
192 | |
193 | for(i=0;i<nfiles;i++) |
194 | { |
195 | int index; |
196 | |
197 | datap[i]=data+i*itemsize; |
198 | |
199 | ReadFile(fds[i],datap[i],itemsize); |
200 | |
201 | index=i; |
202 | heap[index]=i; |
203 | |
204 | /* Bubble up the new value */ |
205 | |
206 | while(index>0) |
207 | { |
208 | int newindex; |
209 | int temp; |
210 | |
211 | newindex=(index-1)/2; |
212 | |
213 | if(compare(datap[heap[index]],datap[heap[newindex]])>=0) |
214 | break; |
215 | |
216 | temp=heap[index]; |
217 | heap[index]=heap[newindex]; |
218 | heap[newindex]=temp; |
219 | |
220 | index=newindex; |
221 | } |
222 | } |
223 | |
224 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
225 | |
226 | ndata=nfiles; |
227 | |
228 | do |
229 | { |
230 | int index=0; |
231 | |
232 | if(!buildindex || buildindex(datap[heap[index]],count)) |
233 | { |
234 | WriteFile(fd_out,datap[heap[index]],itemsize); |
235 | count++; |
236 | } |
237 | |
238 | if(ReadFile(fds[heap[index]],datap[heap[index]],itemsize)) |
239 | { |
240 | ndata--; |
241 | heap[index]=heap[ndata]; |
242 | } |
243 | |
244 | /* Bubble down the new value */ |
245 | |
246 | while((2*index+2)<ndata) |
247 | { |
248 | int newindex; |
249 | int temp; |
250 | |
251 | newindex=2*index+1; |
252 | |
253 | if(compare(datap[heap[newindex]],datap[heap[newindex+1]])>0) |
254 | newindex=newindex+1; |
255 | |
256 | if(compare(datap[heap[index]],datap[heap[newindex]])<=0) |
257 | break; |
258 | |
259 | temp=heap[newindex]; |
260 | heap[newindex]=heap[index]; |
261 | heap[index]=temp; |
262 | |
263 | index=newindex; |
264 | } |
265 | |
266 | if((2*index+2)==ndata) |
267 | { |
268 | int newindex; |
269 | int temp; |
270 | |
271 | newindex=2*index+1; |
272 | |
273 | if(compare(datap[heap[index]],datap[heap[newindex]])<=0) |
274 | ; /* break */ |
275 | else |
276 | { |
277 | temp=heap[newindex]; |
278 | heap[newindex]=heap[index]; |
279 | heap[index]=temp; |
280 | } |
281 | } |
282 | } |
283 | while(ndata>0); |
284 | |
285 | /* Tidy up */ |
286 | |
287 | tidy_and_exit: |
288 | |
289 | if(fds) |
290 | { |
291 | for(i=0;i<nfiles;i++) |
292 | CloseFile(fds[i]); |
293 | free(fds); |
294 | } |
295 | |
296 | if(heap) |
297 | free(heap); |
298 | |
299 | free(data); |
300 | free(datap); |
301 | free(filename); |
302 | } |
303 | |
304 | |
305 | /*++++++++++++++++++++++++++++++++++++++ |
306 | A function to sort the contents of a file of variable length objects (each |
307 | preceded by its length in FILESORT_VARSIZE bytes) using a limited amount of RAM. |
308 | |
309 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
310 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
311 | The individual sort steps and the merge step both use a "Heap sort" |
312 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
313 | if the data is already partially sorted. |
314 | |
315 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
316 | |
317 | int fd_out The file descriptor of the output file (opened for writing and empty). |
318 | |
319 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
320 | data to be sorted is an array of things not pointers). |
321 | |
322 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
323 | returns 1 then it is written to the output file. |
324 | ++++++++++++++++++++++++++++++++++++++*/ |
325 | |
326 | void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
327 | { |
328 | int *fds=NULL,*heap=NULL; |
329 | int nfiles=0,ndata=0; |
330 | index_t count=0,total=0; |
331 | FILESORT_VARINT nextitemsize,largestitemsize=0; |
332 | void *data=NULL,**datap=NULL; |
333 | char *filename; |
334 | int i,more=1; |
335 | |
336 | /* Allocate the RAM buffer and other bits */ |
337 | |
338 | data=malloc(option_filesort_ramsize); |
339 | |
340 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
341 | |
342 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
343 | |
344 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */ |
345 | goto tidy_and_exit; |
346 | |
347 | do |
348 | { |
349 | int fd,n=0; |
350 | size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE; |
351 | |
352 | datap=data+option_filesort_ramsize; |
353 | |
354 | /* Read in the data and create pointers */ |
355 | |
356 | while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data)) |
357 | { |
358 | FILESORT_VARINT itemsize=nextitemsize; |
359 | |
360 | if(itemsize>largestitemsize) |
361 | largestitemsize=itemsize; |
362 | |
363 | *(FILESORT_VARINT*)(data+ramused)=itemsize; |
364 | |
365 | ramused+=FILESORT_VARSIZE; |
366 | |
367 | ReadFile(fd_in,data+ramused,itemsize); |
368 | |
369 | *--datap=data+ramused; /* points to real data */ |
370 | |
371 | ramused+=itemsize; |
372 | |
373 | ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN); |
374 | ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE; |
375 | |
376 | total++; |
377 | n++; |
378 | |
379 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) |
380 | { |
381 | more=0; |
382 | break; |
383 | } |
384 | } |
385 | |
386 | /* No new data read in this time round */ |
387 | |
388 | if(n==0) |
389 | break; |
390 | |
391 | /* Sort the data pointers using a heap sort */ |
392 | |
393 | filesort_heapsort(datap,n,compare); |
394 | |
395 | /* Shortcut if all read in and sorted at once */ |
396 | |
397 | if(nfiles==0 && !more) |
398 | { |
399 | for(i=0;i<n;i++) |
400 | { |
401 | if(!buildindex || buildindex(datap[i],count)) |
402 | { |
403 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
404 | |
405 | WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
406 | count++; |
407 | } |
408 | } |
409 | |
410 | goto tidy_and_exit; |
411 | } |
412 | |
413 | /* Create a temporary file and write the result */ |
414 | |
415 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
416 | |
417 | fd=OpenFileNew(filename); |
418 | |
419 | for(i=0;i<n;i++) |
420 | { |
421 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
422 | |
423 | WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
424 | } |
425 | |
426 | CloseFile(fd); |
427 | |
428 | nfiles++; |
429 | } |
430 | while(more); |
431 | |
432 | /* Check that number of files is less than file size */ |
433 | |
434 | largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN); |
435 | |
436 | assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize)); |
437 | |
438 | /* Open all of the temporary files */ |
439 | |
440 | fds=(int*)malloc(nfiles*sizeof(int)); |
441 | |
442 | for(i=0;i<nfiles;i++) |
443 | { |
444 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
445 | |
446 | fds[i]=ReOpenFile(filename); |
447 | |
448 | DeleteFile(filename); |
449 | } |
450 | |
451 | /* Perform an n-way merge using a binary heap */ |
452 | |
453 | heap=(int*)malloc(nfiles*sizeof(int)); |
454 | |
455 | datap=data+option_filesort_ramsize-nfiles*sizeof(void*); |
456 | |
457 | /* Fill the heap to start with */ |
458 | |
459 | for(i=0;i<nfiles;i++) |
460 | { |
461 | int index; |
462 | FILESORT_VARINT itemsize; |
463 | |
464 | datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize; |
465 | |
466 | ReadFile(fds[i],&itemsize,FILESORT_VARSIZE); |
467 | |
468 | *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize; |
469 | |
470 | ReadFile(fds[i],datap[i],itemsize); |
471 | |
472 | index=i; |
473 | heap[index]=i; |
474 | |
475 | /* Bubble up the new value */ |
476 | |
477 | while(index>0) |
478 | { |
479 | int newindex; |
480 | int temp; |
481 | |
482 | newindex=(index-1)/2; |
483 | |
484 | if(compare(datap[heap[index]],datap[heap[newindex]])>=0) |
485 | break; |
486 | |
487 | temp=heap[index]; |
488 | heap[index]=heap[newindex]; |
489 | heap[newindex]=temp; |
490 | |
491 | index=newindex; |
492 | } |
493 | } |
494 | |
495 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
496 | |
497 | ndata=nfiles; |
498 | |
499 | do |
500 | { |
501 | int index=0; |
502 | FILESORT_VARINT itemsize; |
503 | |
504 | if(!buildindex || buildindex(datap[heap[index]],count)) |
505 | { |
506 | itemsize=*(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE); |
507 | |
508 | WriteFile(fd_out,datap[heap[index]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
509 | count++; |
510 | } |
511 | |
512 | if(ReadFile(fds[heap[index]],&itemsize,FILESORT_VARSIZE)) |
513 | { |
514 | ndata--; |
515 | heap[index]=heap[ndata]; |
516 | } |
517 | else |
518 | { |
519 | *(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE)=itemsize; |
520 | |
521 | ReadFile(fds[heap[index]],datap[heap[index]],itemsize); |
522 | } |
523 | |
524 | /* Bubble down the new value */ |
525 | |
526 | while((2*index+2)<ndata) |
527 | { |
528 | int newindex; |
529 | int temp; |
530 | |
531 | newindex=2*index+1; |
532 | |
533 | if(compare(datap[heap[newindex]],datap[heap[newindex+1]])>0) |
534 | newindex=newindex+1; |
535 | |
536 | if(compare(datap[heap[index]],datap[heap[newindex]])<=0) |
537 | break; |
538 | |
539 | temp=heap[newindex]; |
540 | heap[newindex]=heap[index]; |
541 | heap[index]=temp; |
542 | |
543 | index=newindex; |
544 | } |
545 | |
546 | if((2*index+2)==ndata) |
547 | { |
548 | int newindex; |
549 | int temp; |
550 | |
551 | newindex=2*index+1; |
552 | |
553 | if(compare(datap[heap[index]],datap[heap[newindex]])<=0) |
554 | ; /* break */ |
555 | else |
556 | { |
557 | temp=heap[newindex]; |
558 | heap[newindex]=heap[index]; |
559 | heap[index]=temp; |
560 | } |
561 | } |
562 | } |
563 | while(ndata>0); |
564 | |
565 | /* Tidy up */ |
566 | |
567 | tidy_and_exit: |
568 | |
569 | if(fds) |
570 | { |
571 | for(i=0;i<nfiles;i++) |
572 | CloseFile(fds[i]); |
573 | free(fds); |
574 | } |
575 | |
576 | if(heap) |
577 | free(heap); |
578 | |
579 | free(data); |
580 | free(filename); |
581 | } |
582 | |
583 | |
584 | /*++++++++++++++++++++++++++++++++++++++ |
585 | A function to sort an array of pointers efficiently. |
586 | |
587 | The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort, |
588 | in particular, this is good because it can operate in-place and doesn't |
589 | allocate more memory like using qsort() does. |
590 | |
591 | void **datap A pointer to the array of pointers to sort. |
592 | |
593 | size_t nitems The number of items of data to sort. |
594 | |
595 | int(*compare)(const void *, const void *) The comparison function (identical to qsort if the |
596 | data to be sorted was an array of things not pointers). |
597 | ++++++++++++++++++++++++++++++++++++++*/ |
598 | |
599 | void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*)) |
600 | { |
601 | int i; |
602 | |
603 | /* Fill the heap by pretending to insert the data that is already there */ |
604 | |
605 | for(i=1;i<nitems;i++) |
606 | { |
607 | int index=i; |
608 | |
609 | /* Bubble up the new value (upside-down, put largest at top) */ |
610 | |
611 | while(index>0) |
612 | { |
613 | int newindex; |
614 | void *temp; |
615 | |
616 | newindex=(index-1)/2; |
617 | |
618 | if(compare(datap[index],datap[newindex])<=0) /* reversed compared to filesort_fixed() above */ |
619 | break; |
620 | |
621 | temp=datap[index]; |
622 | datap[index]=datap[newindex]; |
623 | datap[newindex]=temp; |
624 | |
625 | index=newindex; |
626 | } |
627 | } |
628 | |
629 | /* Repeatedly pull out the root of the heap and swap with the bottom item */ |
630 | |
631 | for(i=nitems-1;i>0;i--) |
632 | { |
633 | int index=0; |
634 | void *temp; |
635 | |
636 | temp=datap[index]; |
637 | datap[index]=datap[i]; |
638 | datap[i]=temp; |
639 | |
640 | /* Bubble down the new value (upside-down, put largest at top) */ |
641 | |
642 | while((2*index+2)<i) |
643 | { |
644 | int newindex; |
645 | void *temp; |
646 | |
647 | newindex=2*index+1; |
648 | |
649 | if(compare(datap[newindex],datap[newindex+1])<0) /* reversed compared to filesort_fixed() above */ |
650 | newindex=newindex+1; |
651 | |
652 | if(compare(datap[index],datap[newindex])>=0) /* reversed compared to filesort_fixed() above */ |
653 | break; |
654 | |
655 | temp=datap[newindex]; |
656 | datap[newindex]=datap[index]; |
657 | datap[index]=temp; |
658 | |
659 | index=newindex; |
660 | } |
661 | |
662 | if((2*index+2)==i) |
663 | { |
664 | int newindex; |
665 | void *temp; |
666 | |
667 | newindex=2*index+1; |
668 | |
669 | if(compare(datap[index],datap[newindex])>=0) /* reversed compared to filesort_fixed() above */ |
670 | ; /* break */ |
671 | else |
672 | { |
673 | temp=datap[newindex]; |
674 | datap[newindex]=datap[index]; |
675 | datap[index]=temp; |
676 | } |
677 | } |
678 | } |
679 | } |
Properties
Name | Value |
---|---|
cvs:description | Functions to perform sorting. |