Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino
Contents of /trunk/src/sorting.c
Parent Directory
|
Revision Log
Revision 680 -
(show annotations)
(download)
(as text)
Sun Apr 24 15:14:53 2011 UTC (13 years, 11 months ago) by amb
File MIME type: text/x-csrc
File size: 16013 byte(s)
Sun Apr 24 15:14:53 2011 UTC (13 years, 11 months ago) by amb
File MIME type: text/x-csrc
File size: 16013 byte(s)
Update comments throughout the source code.
1 | /*************************************** |
2 | Merge sort functions. |
3 | |
4 | Part of the Routino routing software. |
5 | ******************/ /****************** |
6 | This file Copyright 2009-2011 Andrew M. Bishop |
7 | |
8 | This program is free software: you can redistribute it and/or modify |
9 | it under the terms of the GNU Affero General Public License as published by |
10 | the Free Software Foundation, either version 3 of the License, or |
11 | (at your option) any later version. |
12 | |
13 | This program is distributed in the hope that it will be useful, |
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | GNU Affero General Public License for more details. |
17 | |
18 | You should have received a copy of the GNU Affero General Public License |
19 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | ***************************************/ |
21 | |
22 | |
23 | #include <stdio.h> |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | #include <assert.h> |
27 | |
28 | #include "types.h" |
29 | |
30 | #include "files.h" |
31 | #include "sorting.h" |
32 | |
33 | |
34 | /* Global variables */ |
35 | |
36 | /*+ The command line '--tmpdir' option or its default value. +*/ |
37 | extern char *option_tmpdirname; |
38 | |
39 | /*+ The amount of RAM to use for filesorting. +*/ |
40 | extern size_t option_filesort_ramsize; |
41 | |
42 | |
43 | /*++++++++++++++++++++++++++++++++++++++ |
44 | A function to sort the contents of a file of fixed length objects using a |
45 | limited amount of RAM. |
46 | |
47 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
48 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
49 | The individual sort steps and the merge step both use a "Heap sort" |
50 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
51 | if the data is already partially sorted. |
52 | |
53 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
54 | |
55 | int fd_out The file descriptor of the output file (opened for writing and empty). |
56 | |
57 | size_t itemsize The size of each item in the file that needs sorting. |
58 | |
59 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
60 | data to be sorted is an array of things not pointers). |
61 | |
62 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
63 | returns 1 then it is written to the output file. |
64 | ++++++++++++++++++++++++++++++++++++++*/ |
65 | |
66 | void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
67 | { |
68 | int *fds=NULL,*heap=NULL; |
69 | int nfiles=0,ndata=0; |
70 | index_t count=0,total=0; |
71 | size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*)); |
72 | void *data=NULL,**datap=NULL; |
73 | char *filename; |
74 | int i,more=1; |
75 | |
76 | /* Allocate the RAM buffer and other bits */ |
77 | |
78 | data=malloc(nitems*itemsize); |
79 | datap=malloc(nitems*sizeof(void*)); |
80 | |
81 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
82 | |
83 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
84 | |
85 | do |
86 | { |
87 | int fd,n=0; |
88 | |
89 | /* Read in the data and create pointers */ |
90 | |
91 | for(i=0;i<nitems;i++) |
92 | { |
93 | datap[i]=data+i*itemsize; |
94 | |
95 | if(ReadFile(fd_in,datap[i],itemsize)) |
96 | { |
97 | more=0; |
98 | break; |
99 | } |
100 | |
101 | total++; |
102 | } |
103 | |
104 | n=i; |
105 | |
106 | /* Shortcut if there is no data and no previous files (i.e. no data at all) */ |
107 | |
108 | if(nfiles==0 && n==0) |
109 | goto tidy_and_exit; |
110 | |
111 | /* No new data read in this time round */ |
112 | |
113 | if(n==0) |
114 | break; |
115 | |
116 | /* Sort the data pointers using a heap sort */ |
117 | |
118 | filesort_heapsort(datap,n,compare); |
119 | |
120 | /* Shortcut if all read in and sorted at once */ |
121 | |
122 | if(nfiles==0 && !more) |
123 | { |
124 | for(i=0;i<n;i++) |
125 | { |
126 | if(!buildindex || buildindex(datap[i],count)) |
127 | { |
128 | WriteFile(fd_out,datap[i],itemsize); |
129 | count++; |
130 | } |
131 | } |
132 | |
133 | goto tidy_and_exit; |
134 | } |
135 | |
136 | /* Create a temporary file and write the result */ |
137 | |
138 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
139 | |
140 | fd=OpenFileNew(filename); |
141 | |
142 | for(i=0;i<n;i++) |
143 | WriteFile(fd,datap[i],itemsize); |
144 | |
145 | CloseFile(fd); |
146 | |
147 | nfiles++; |
148 | } |
149 | while(more); |
150 | |
151 | /* Shortcut if only one file (unlucky for us there must have been exactly |
152 | nitems, lucky for us we still have the data in RAM) */ |
153 | |
154 | if(nfiles==1) |
155 | { |
156 | for(i=0;i<nitems;i++) |
157 | { |
158 | if(!buildindex || buildindex(datap[i],count)) |
159 | { |
160 | WriteFile(fd_out,datap[i],itemsize); |
161 | count++; |
162 | } |
163 | } |
164 | |
165 | DeleteFile(filename); |
166 | |
167 | goto tidy_and_exit; |
168 | } |
169 | |
170 | /* Check that number of files is less than file size */ |
171 | |
172 | assert(nfiles<nitems); |
173 | |
174 | /* Open all of the temporary files */ |
175 | |
176 | fds=(int*)malloc(nfiles*sizeof(int)); |
177 | |
178 | for(i=0;i<nfiles;i++) |
179 | { |
180 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
181 | |
182 | fds[i]=ReOpenFile(filename); |
183 | |
184 | DeleteFile(filename); |
185 | } |
186 | |
187 | /* Perform an n-way merge using a binary heap */ |
188 | |
189 | heap=(int*)malloc(nfiles*sizeof(int)); |
190 | |
191 | /* Fill the heap to start with */ |
192 | |
193 | for(i=0;i<nfiles;i++) |
194 | { |
195 | int index; |
196 | |
197 | datap[i]=data+i*itemsize; |
198 | |
199 | ReadFile(fds[i],datap[i],itemsize); |
200 | |
201 | heap[i]=i; |
202 | |
203 | index=i; |
204 | |
205 | /* Bubble up the new value */ |
206 | |
207 | while(index>0 && |
208 | compare(datap[heap[index]],datap[heap[(index-1)/2]])<0) |
209 | { |
210 | int newindex; |
211 | int temp; |
212 | |
213 | newindex=(index-1)/2; |
214 | |
215 | temp=heap[index]; |
216 | heap[index]=heap[newindex]; |
217 | heap[newindex]=temp; |
218 | |
219 | index=newindex; |
220 | } |
221 | } |
222 | |
223 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
224 | |
225 | ndata=nfiles; |
226 | |
227 | do |
228 | { |
229 | int index=0; |
230 | |
231 | if(!buildindex || buildindex(datap[heap[0]],count)) |
232 | { |
233 | WriteFile(fd_out,datap[heap[0]],itemsize); |
234 | count++; |
235 | } |
236 | |
237 | if(ReadFile(fds[heap[0]],datap[heap[0]],itemsize)) |
238 | { |
239 | ndata--; |
240 | heap[0]=heap[ndata]; |
241 | } |
242 | |
243 | /* Bubble down the new value */ |
244 | |
245 | while((2*index+2)<ndata && |
246 | (compare(datap[heap[index]],datap[heap[2*index+1]])>0 || |
247 | compare(datap[heap[index]],datap[heap[2*index+2]])>0)) |
248 | { |
249 | int newindex; |
250 | int temp; |
251 | |
252 | if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0) |
253 | newindex=2*index+1; |
254 | else |
255 | newindex=2*index+2; |
256 | |
257 | temp=heap[newindex]; |
258 | heap[newindex]=heap[index]; |
259 | heap[index]=temp; |
260 | |
261 | index=newindex; |
262 | } |
263 | |
264 | if((2*index+2)==ndata && |
265 | compare(datap[heap[index]],datap[heap[2*index+1]])>0) |
266 | { |
267 | int newindex; |
268 | int temp; |
269 | |
270 | newindex=2*index+1; |
271 | |
272 | temp=heap[newindex]; |
273 | heap[newindex]=heap[index]; |
274 | heap[index]=temp; |
275 | } |
276 | } |
277 | while(ndata>0); |
278 | |
279 | /* Tidy up */ |
280 | |
281 | tidy_and_exit: |
282 | |
283 | if(fds) |
284 | { |
285 | for(i=0;i<nfiles;i++) |
286 | CloseFile(fds[i]); |
287 | free(fds); |
288 | } |
289 | |
290 | if(heap) |
291 | free(heap); |
292 | |
293 | free(data); |
294 | free(datap); |
295 | free(filename); |
296 | } |
297 | |
298 | |
299 | /*++++++++++++++++++++++++++++++++++++++ |
300 | A function to sort the contents of a file of variable length objects (each |
301 | preceded by its length in 2 bytes) using a limited amount of RAM. |
302 | |
303 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
304 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
305 | The individual sort steps and the merge step both use a "Heap sort" |
306 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
307 | if the data is already partially sorted. |
308 | |
309 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
310 | |
311 | int fd_out The file descriptor of the output file (opened for writing and empty). |
312 | |
313 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
314 | data to be sorted is an array of things not pointers). |
315 | |
316 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
317 | returns 1 then it is written to the output file. |
318 | ++++++++++++++++++++++++++++++++++++++*/ |
319 | |
320 | void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
321 | { |
322 | int *fds=NULL,*heap=NULL; |
323 | int nfiles=0,ndata=0; |
324 | index_t count=0,total=0; |
325 | FILESORT_VARINT nextitemsize,largestitemsize=0; |
326 | void *data=NULL,**datap=NULL; |
327 | char *filename; |
328 | int i,more=1; |
329 | |
330 | /* Allocate the RAM buffer and other bits */ |
331 | |
332 | data=malloc(option_filesort_ramsize); |
333 | |
334 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
335 | |
336 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
337 | |
338 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */ |
339 | goto tidy_and_exit; |
340 | |
341 | do |
342 | { |
343 | int fd,n=0; |
344 | size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE; |
345 | |
346 | datap=data+option_filesort_ramsize; |
347 | |
348 | /* Read in the data and create pointers */ |
349 | |
350 | while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data)) |
351 | { |
352 | FILESORT_VARINT itemsize=nextitemsize; |
353 | |
354 | if(itemsize>largestitemsize) |
355 | largestitemsize=itemsize; |
356 | |
357 | *(FILESORT_VARINT*)(data+ramused)=itemsize; |
358 | |
359 | ramused+=FILESORT_VARSIZE; |
360 | |
361 | ReadFile(fd_in,data+ramused,itemsize); |
362 | |
363 | *--datap=data+ramused; /* points to real data */ |
364 | |
365 | ramused+=itemsize; |
366 | |
367 | ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN); |
368 | ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE; |
369 | |
370 | total++; |
371 | n++; |
372 | |
373 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) |
374 | { |
375 | more=0; |
376 | break; |
377 | } |
378 | } |
379 | |
380 | /* No new data read in this time round */ |
381 | |
382 | if(n==0) |
383 | break; |
384 | |
385 | /* Sort the data pointers using a heap sort */ |
386 | |
387 | filesort_heapsort(datap,n,compare); |
388 | |
389 | /* Shortcut if all read in and sorted at once */ |
390 | |
391 | if(nfiles==0 && !more) |
392 | { |
393 | for(i=0;i<n;i++) |
394 | { |
395 | if(!buildindex || buildindex(datap[i],count)) |
396 | { |
397 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
398 | |
399 | WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
400 | count++; |
401 | } |
402 | } |
403 | |
404 | goto tidy_and_exit; |
405 | } |
406 | |
407 | /* Create a temporary file and write the result */ |
408 | |
409 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
410 | |
411 | fd=OpenFileNew(filename); |
412 | |
413 | for(i=0;i<n;i++) |
414 | { |
415 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
416 | |
417 | WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
418 | } |
419 | |
420 | CloseFile(fd); |
421 | |
422 | nfiles++; |
423 | } |
424 | while(more); |
425 | |
426 | /* Check that number of files is less than file size */ |
427 | |
428 | largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN); |
429 | |
430 | assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize)); |
431 | |
432 | /* Open all of the temporary files */ |
433 | |
434 | fds=(int*)malloc(nfiles*sizeof(int)); |
435 | |
436 | for(i=0;i<nfiles;i++) |
437 | { |
438 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
439 | |
440 | fds[i]=ReOpenFile(filename); |
441 | |
442 | DeleteFile(filename); |
443 | } |
444 | |
445 | /* Perform an n-way merge using a binary heap */ |
446 | |
447 | heap=(int*)malloc(nfiles*sizeof(int)); |
448 | |
449 | datap=data+option_filesort_ramsize-nfiles*sizeof(void*); |
450 | |
451 | /* Fill the heap to start with */ |
452 | |
453 | for(i=0;i<nfiles;i++) |
454 | { |
455 | int index; |
456 | FILESORT_VARINT itemsize; |
457 | |
458 | datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize; |
459 | |
460 | ReadFile(fds[i],&itemsize,FILESORT_VARSIZE); |
461 | |
462 | *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize; |
463 | |
464 | ReadFile(fds[i],datap[i],itemsize); |
465 | |
466 | heap[i]=i; |
467 | |
468 | index=i; |
469 | |
470 | /* Bubble up the new value */ |
471 | |
472 | while(index>0 && |
473 | compare(datap[heap[index]],datap[heap[(index-1)/2]])<0) |
474 | { |
475 | int newindex; |
476 | int temp; |
477 | |
478 | newindex=(index-1)/2; |
479 | |
480 | temp=heap[index]; |
481 | heap[index]=heap[newindex]; |
482 | heap[newindex]=temp; |
483 | |
484 | index=newindex; |
485 | } |
486 | } |
487 | |
488 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
489 | |
490 | ndata=nfiles; |
491 | |
492 | do |
493 | { |
494 | int index=0; |
495 | FILESORT_VARINT itemsize; |
496 | |
497 | if(!buildindex || buildindex(datap[heap[0]],count)) |
498 | { |
499 | itemsize=*(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE); |
500 | |
501 | WriteFile(fd_out,datap[heap[0]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
502 | count++; |
503 | } |
504 | |
505 | if(ReadFile(fds[heap[0]],&itemsize,FILESORT_VARSIZE)) |
506 | { |
507 | ndata--; |
508 | heap[0]=heap[ndata]; |
509 | } |
510 | else |
511 | { |
512 | *(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE)=itemsize; |
513 | |
514 | ReadFile(fds[heap[0]],datap[heap[0]],itemsize); |
515 | } |
516 | |
517 | /* Bubble down the new value */ |
518 | |
519 | while((2*index+2)<ndata && |
520 | (compare(datap[heap[index]],datap[heap[2*index+1]])>0 || |
521 | compare(datap[heap[index]],datap[heap[2*index+2]])>0)) |
522 | { |
523 | int newindex; |
524 | int temp; |
525 | |
526 | if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0) |
527 | newindex=2*index+1; |
528 | else |
529 | newindex=2*index+2; |
530 | |
531 | temp=heap[newindex]; |
532 | heap[newindex]=heap[index]; |
533 | heap[index]=temp; |
534 | |
535 | index=newindex; |
536 | } |
537 | |
538 | if((2*index+2)==ndata && |
539 | compare(datap[heap[index]],datap[heap[2*index+1]])>0) |
540 | { |
541 | int newindex; |
542 | int temp; |
543 | |
544 | newindex=2*index+1; |
545 | |
546 | temp=heap[newindex]; |
547 | heap[newindex]=heap[index]; |
548 | heap[index]=temp; |
549 | } |
550 | } |
551 | while(ndata>0); |
552 | |
553 | /* Tidy up */ |
554 | |
555 | tidy_and_exit: |
556 | |
557 | if(fds) |
558 | { |
559 | for(i=0;i<nfiles;i++) |
560 | CloseFile(fds[i]); |
561 | free(fds); |
562 | } |
563 | |
564 | if(heap) |
565 | free(heap); |
566 | |
567 | free(data); |
568 | free(filename); |
569 | } |
570 | |
571 | |
572 | /*++++++++++++++++++++++++++++++++++++++ |
573 | A function to sort an array of pointers efficiently. |
574 | |
575 | The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort, |
576 | in particular an this good because it can operate in-place and doesn't |
577 | allocate more memory like using qsort() does. |
578 | |
579 | void **datap A pointer to the array of pointers to sort. |
580 | |
581 | size_t nitems The number of items of data to sort. |
582 | |
583 | int(*compare)(const void *, const void *) The comparison function (identical to qsort if the |
584 | data to be sorted was an array of things not pointers). |
585 | ++++++++++++++++++++++++++++++++++++++*/ |
586 | |
587 | void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*)) |
588 | { |
589 | int i; |
590 | |
591 | /* Fill the heap by pretending to insert the data that is already there */ |
592 | |
593 | for(i=1;i<nitems;i++) |
594 | { |
595 | int index=i; |
596 | |
597 | /* Bubble up the new value (upside-down, put largest at top) */ |
598 | |
599 | while(index>0 && |
600 | compare(datap[index],datap[(index-1)/2])>0) /* reversed compared to filesort() above */ |
601 | { |
602 | int newindex; |
603 | void *temp; |
604 | |
605 | newindex=(index-1)/2; |
606 | |
607 | temp=datap[index]; |
608 | datap[index]=datap[newindex]; |
609 | datap[newindex]=temp; |
610 | |
611 | index=newindex; |
612 | } |
613 | } |
614 | |
615 | /* Repeatedly pull out the root of the heap and swap with the bottom item */ |
616 | |
617 | for(i=nitems-1;i>0;i--) |
618 | { |
619 | int index=0; |
620 | void *temp; |
621 | |
622 | temp=datap[index]; |
623 | datap[index]=datap[i]; |
624 | datap[i]=temp; |
625 | |
626 | /* Bubble down the new value (upside-down, put largest at top) */ |
627 | |
628 | while((2*index+2)<i && |
629 | (compare(datap[index],datap[2*index+1])<0 || /* reversed compared to filesort() above */ |
630 | compare(datap[index],datap[2*index+2])<0)) /* reversed compared to filesort() above */ |
631 | { |
632 | int newindex; |
633 | void *temp; |
634 | |
635 | if(compare(datap[2*index+1],datap[2*index+2])>0) /* reversed compared to filesort() above */ |
636 | newindex=2*index+1; |
637 | else |
638 | newindex=2*index+2; |
639 | |
640 | temp=datap[newindex]; |
641 | datap[newindex]=datap[index]; |
642 | datap[index]=temp; |
643 | |
644 | index=newindex; |
645 | } |
646 | |
647 | if((2*index+2)==i && |
648 | compare(datap[index],datap[2*index+1])<0) /* reversed compared to filesort() above */ |
649 | { |
650 | int newindex; |
651 | void *temp; |
652 | |
653 | newindex=2*index+1; |
654 | |
655 | temp=datap[newindex]; |
656 | datap[newindex]=datap[index]; |
657 | datap[index]=temp; |
658 | } |
659 | } |
660 | } |
Properties
Name | Value |
---|---|
cvs:description | Functions to perform sorting. |