Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino
Contents of /trunk/src/sorting.c
Parent Directory
|
Revision Log
Revision 875 -
(show annotations)
(download)
(as text)
Sat Oct 22 16:27:29 2011 UTC (13 years, 4 months ago) by amb
File MIME type: text/x-csrc
File size: 16165 byte(s)
Sat Oct 22 16:27:29 2011 UTC (13 years, 4 months ago) by amb
File MIME type: text/x-csrc
File size: 16165 byte(s)
Revert back to something very close to r869 because it is fastest by a tiny fraction.
1 | /*************************************** |
2 | Merge sort functions. |
3 | |
4 | Part of the Routino routing software. |
5 | ******************/ /****************** |
6 | This file Copyright 2009-2011 Andrew M. Bishop |
7 | |
8 | This program is free software: you can redistribute it and/or modify |
9 | it under the terms of the GNU Affero General Public License as published by |
10 | the Free Software Foundation, either version 3 of the License, or |
11 | (at your option) any later version. |
12 | |
13 | This program is distributed in the hope that it will be useful, |
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | GNU Affero General Public License for more details. |
17 | |
18 | You should have received a copy of the GNU Affero General Public License |
19 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | ***************************************/ |
21 | |
22 | |
23 | #include <stdio.h> |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | #include <assert.h> |
27 | |
28 | #include "types.h" |
29 | |
30 | #include "files.h" |
31 | #include "sorting.h" |
32 | |
33 | |
34 | /* Global variables */ |
35 | |
36 | /*+ The command line '--tmpdir' option or its default value. +*/ |
37 | extern char *option_tmpdirname; |
38 | |
39 | /*+ The amount of RAM to use for filesorting. +*/ |
40 | extern size_t option_filesort_ramsize; |
41 | |
42 | |
43 | /*++++++++++++++++++++++++++++++++++++++ |
44 | A function to sort the contents of a file of fixed length objects using a |
45 | limited amount of RAM. |
46 | |
47 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
48 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
49 | The individual sort steps and the merge step both use a "Heap sort" |
50 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
51 | if the data is already partially sorted. |
52 | |
53 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
54 | |
55 | int fd_out The file descriptor of the output file (opened for writing and empty). |
56 | |
57 | size_t itemsize The size of each item in the file that needs sorting. |
58 | |
59 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
60 | data to be sorted is an array of things not pointers). |
61 | |
62 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
63 | returns 1 then it is written to the output file. |
64 | ++++++++++++++++++++++++++++++++++++++*/ |
65 | |
66 | void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
67 | { |
68 | int *fds=NULL,*heap=NULL; |
69 | int nfiles=0,ndata=0; |
70 | index_t count=0,total=0; |
71 | size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*)); |
72 | void *data=NULL,**datap=NULL; |
73 | char *filename; |
74 | int i,more=1; |
75 | |
76 | /* Allocate the RAM buffer and other bits */ |
77 | |
78 | data=malloc(nitems*itemsize); |
79 | datap=malloc(nitems*sizeof(void*)); |
80 | |
81 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
82 | |
83 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
84 | |
85 | do |
86 | { |
87 | int fd,n=0; |
88 | |
89 | /* Read in the data and create pointers */ |
90 | |
91 | for(i=0;i<nitems;i++) |
92 | { |
93 | datap[i]=data+i*itemsize; |
94 | |
95 | if(ReadFile(fd_in,datap[i],itemsize)) |
96 | { |
97 | more=0; |
98 | break; |
99 | } |
100 | |
101 | total++; |
102 | } |
103 | |
104 | n=i; |
105 | |
106 | /* Shortcut if there is no data and no previous files (i.e. no data at all) */ |
107 | |
108 | if(nfiles==0 && n==0) |
109 | goto tidy_and_exit; |
110 | |
111 | /* No new data read in this time round */ |
112 | |
113 | if(n==0) |
114 | break; |
115 | |
116 | /* Sort the data pointers using a heap sort */ |
117 | |
118 | filesort_heapsort(datap,n,compare); |
119 | |
120 | /* Shortcut if all read in and sorted at once */ |
121 | |
122 | if(nfiles==0 && !more) |
123 | { |
124 | for(i=0;i<n;i++) |
125 | { |
126 | if(!buildindex || buildindex(datap[i],count)) |
127 | { |
128 | WriteFile(fd_out,datap[i],itemsize); |
129 | count++; |
130 | } |
131 | } |
132 | |
133 | goto tidy_and_exit; |
134 | } |
135 | |
136 | /* Create a temporary file and write the result */ |
137 | |
138 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
139 | |
140 | fd=OpenFileNew(filename); |
141 | |
142 | for(i=0;i<n;i++) |
143 | WriteFile(fd,datap[i],itemsize); |
144 | |
145 | CloseFile(fd); |
146 | |
147 | nfiles++; |
148 | } |
149 | while(more); |
150 | |
151 | /* Shortcut if only one file (unlucky for us there must have been exactly |
152 | nitems, lucky for us we still have the data in RAM) */ |
153 | |
154 | if(nfiles==1) |
155 | { |
156 | for(i=0;i<nitems;i++) |
157 | { |
158 | if(!buildindex || buildindex(datap[i],count)) |
159 | { |
160 | WriteFile(fd_out,datap[i],itemsize); |
161 | count++; |
162 | } |
163 | } |
164 | |
165 | DeleteFile(filename); |
166 | |
167 | goto tidy_and_exit; |
168 | } |
169 | |
170 | /* Check that number of files is less than file size */ |
171 | |
172 | assert(nfiles<nitems); |
173 | |
174 | /* Open all of the temporary files */ |
175 | |
176 | fds=(int*)malloc(nfiles*sizeof(int)); |
177 | |
178 | for(i=0;i<nfiles;i++) |
179 | { |
180 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
181 | |
182 | fds[i]=ReOpenFile(filename); |
183 | |
184 | DeleteFile(filename); |
185 | } |
186 | |
187 | /* Perform an n-way merge using a binary heap */ |
188 | |
189 | heap=(int*)malloc((1+nfiles)*sizeof(int)); |
190 | |
191 | /* Fill the heap to start with */ |
192 | |
193 | for(i=0;i<nfiles;i++) |
194 | { |
195 | int index; |
196 | |
197 | datap[i]=data+i*itemsize; |
198 | |
199 | ReadFile(fds[i],datap[i],itemsize); |
200 | |
201 | index=i+1; |
202 | |
203 | heap[index]=i; |
204 | |
205 | /* Bubble up the new value */ |
206 | |
207 | while(index>1) |
208 | { |
209 | int newindex; |
210 | int temp; |
211 | |
212 | newindex=index/2; |
213 | |
214 | if(compare(datap[heap[index]],datap[heap[newindex]])>=0) |
215 | break; |
216 | |
217 | temp=heap[index]; |
218 | heap[index]=heap[newindex]; |
219 | heap[newindex]=temp; |
220 | |
221 | index=newindex; |
222 | } |
223 | } |
224 | |
225 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
226 | |
227 | ndata=nfiles; |
228 | |
229 | do |
230 | { |
231 | int index=1; |
232 | |
233 | if(!buildindex || buildindex(datap[heap[index]],count)) |
234 | { |
235 | WriteFile(fd_out,datap[heap[index]],itemsize); |
236 | count++; |
237 | } |
238 | |
239 | if(ReadFile(fds[heap[index]],datap[heap[index]],itemsize)) |
240 | { |
241 | heap[index]=heap[ndata]; |
242 | ndata--; |
243 | } |
244 | |
245 | /* Bubble down the new value */ |
246 | |
247 | while((2*index)<ndata) |
248 | { |
249 | int newindex; |
250 | int temp; |
251 | |
252 | newindex=2*index; |
253 | |
254 | if(compare(datap[heap[newindex]],datap[heap[newindex+1]])>=0) |
255 | newindex=newindex+1; |
256 | |
257 | if(compare(datap[heap[index]],datap[heap[newindex]])<=0) |
258 | break; |
259 | |
260 | temp=heap[newindex]; |
261 | heap[newindex]=heap[index]; |
262 | heap[index]=temp; |
263 | |
264 | index=newindex; |
265 | } |
266 | |
267 | if((2*index)==ndata) |
268 | { |
269 | int newindex; |
270 | int temp; |
271 | |
272 | newindex=2*index; |
273 | |
274 | if(compare(datap[heap[index]],datap[heap[newindex]])<=0) |
275 | ; /* break */ |
276 | else |
277 | { |
278 | temp=heap[newindex]; |
279 | heap[newindex]=heap[index]; |
280 | heap[index]=temp; |
281 | } |
282 | } |
283 | } |
284 | while(ndata>0); |
285 | |
286 | /* Tidy up */ |
287 | |
288 | tidy_and_exit: |
289 | |
290 | if(fds) |
291 | { |
292 | for(i=0;i<nfiles;i++) |
293 | CloseFile(fds[i]); |
294 | free(fds); |
295 | } |
296 | |
297 | if(heap) |
298 | free(heap); |
299 | |
300 | free(data); |
301 | free(datap); |
302 | free(filename); |
303 | } |
304 | |
305 | |
306 | /*++++++++++++++++++++++++++++++++++++++ |
307 | A function to sort the contents of a file of variable length objects (each |
308 | preceded by its length in FILESORT_VARSIZE bytes) using a limited amount of RAM. |
309 | |
310 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
311 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
312 | The individual sort steps and the merge step both use a "Heap sort" |
313 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
314 | if the data is already partially sorted. |
315 | |
316 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
317 | |
318 | int fd_out The file descriptor of the output file (opened for writing and empty). |
319 | |
320 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
321 | data to be sorted is an array of things not pointers). |
322 | |
323 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
324 | returns 1 then it is written to the output file. |
325 | ++++++++++++++++++++++++++++++++++++++*/ |
326 | |
327 | void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
328 | { |
329 | int *fds=NULL,*heap=NULL; |
330 | int nfiles=0,ndata=0; |
331 | index_t count=0,total=0; |
332 | FILESORT_VARINT nextitemsize,largestitemsize=0; |
333 | void *data=NULL,**datap=NULL; |
334 | char *filename; |
335 | int i,more=1; |
336 | |
337 | /* Allocate the RAM buffer and other bits */ |
338 | |
339 | data=malloc(option_filesort_ramsize); |
340 | |
341 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
342 | |
343 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
344 | |
345 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */ |
346 | goto tidy_and_exit; |
347 | |
348 | do |
349 | { |
350 | int fd,n=0; |
351 | size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE; |
352 | |
353 | datap=data+option_filesort_ramsize; |
354 | |
355 | /* Read in the data and create pointers */ |
356 | |
357 | while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data)) |
358 | { |
359 | FILESORT_VARINT itemsize=nextitemsize; |
360 | |
361 | if(itemsize>largestitemsize) |
362 | largestitemsize=itemsize; |
363 | |
364 | *(FILESORT_VARINT*)(data+ramused)=itemsize; |
365 | |
366 | ramused+=FILESORT_VARSIZE; |
367 | |
368 | ReadFile(fd_in,data+ramused,itemsize); |
369 | |
370 | *--datap=data+ramused; /* points to real data */ |
371 | |
372 | ramused+=itemsize; |
373 | |
374 | ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN); |
375 | ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE; |
376 | |
377 | total++; |
378 | n++; |
379 | |
380 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) |
381 | { |
382 | more=0; |
383 | break; |
384 | } |
385 | } |
386 | |
387 | /* No new data read in this time round */ |
388 | |
389 | if(n==0) |
390 | break; |
391 | |
392 | /* Sort the data pointers using a heap sort */ |
393 | |
394 | filesort_heapsort(datap,n,compare); |
395 | |
396 | /* Shortcut if all read in and sorted at once */ |
397 | |
398 | if(nfiles==0 && !more) |
399 | { |
400 | for(i=0;i<n;i++) |
401 | { |
402 | if(!buildindex || buildindex(datap[i],count)) |
403 | { |
404 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
405 | |
406 | WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
407 | count++; |
408 | } |
409 | } |
410 | |
411 | goto tidy_and_exit; |
412 | } |
413 | |
414 | /* Create a temporary file and write the result */ |
415 | |
416 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
417 | |
418 | fd=OpenFileNew(filename); |
419 | |
420 | for(i=0;i<n;i++) |
421 | { |
422 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
423 | |
424 | WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
425 | } |
426 | |
427 | CloseFile(fd); |
428 | |
429 | nfiles++; |
430 | } |
431 | while(more); |
432 | |
433 | /* Check that number of files is less than file size */ |
434 | |
435 | largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN); |
436 | |
437 | assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize)); |
438 | |
439 | /* Open all of the temporary files */ |
440 | |
441 | fds=(int*)malloc(nfiles*sizeof(int)); |
442 | |
443 | for(i=0;i<nfiles;i++) |
444 | { |
445 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
446 | |
447 | fds[i]=ReOpenFile(filename); |
448 | |
449 | DeleteFile(filename); |
450 | } |
451 | |
452 | /* Perform an n-way merge using a binary heap */ |
453 | |
454 | heap=(int*)malloc((1+nfiles)*sizeof(int)); |
455 | |
456 | datap=data+option_filesort_ramsize-nfiles*sizeof(void*); |
457 | |
458 | /* Fill the heap to start with */ |
459 | |
460 | for(i=0;i<nfiles;i++) |
461 | { |
462 | int index; |
463 | FILESORT_VARINT itemsize; |
464 | |
465 | datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize; |
466 | |
467 | ReadFile(fds[i],&itemsize,FILESORT_VARSIZE); |
468 | |
469 | *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize; |
470 | |
471 | ReadFile(fds[i],datap[i],itemsize); |
472 | |
473 | index=i+1; |
474 | |
475 | heap[index]=i; |
476 | |
477 | /* Bubble up the new value */ |
478 | |
479 | while(index>1) |
480 | { |
481 | int newindex; |
482 | int temp; |
483 | |
484 | newindex=index/2; |
485 | |
486 | if(compare(datap[heap[index]],datap[heap[newindex]])>=0) |
487 | break; |
488 | |
489 | temp=heap[index]; |
490 | heap[index]=heap[newindex]; |
491 | heap[newindex]=temp; |
492 | |
493 | index=newindex; |
494 | } |
495 | } |
496 | |
497 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
498 | |
499 | ndata=nfiles; |
500 | |
501 | do |
502 | { |
503 | int index=1; |
504 | FILESORT_VARINT itemsize; |
505 | |
506 | if(!buildindex || buildindex(datap[heap[index]],count)) |
507 | { |
508 | itemsize=*(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE); |
509 | |
510 | WriteFile(fd_out,datap[heap[index]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
511 | count++; |
512 | } |
513 | |
514 | if(ReadFile(fds[heap[index]],&itemsize,FILESORT_VARSIZE)) |
515 | { |
516 | heap[index]=heap[ndata]; |
517 | ndata--; |
518 | } |
519 | else |
520 | { |
521 | *(FILESORT_VARINT*)(datap[heap[index]]-FILESORT_VARSIZE)=itemsize; |
522 | |
523 | ReadFile(fds[heap[index]],datap[heap[index]],itemsize); |
524 | } |
525 | |
526 | /* Bubble down the new value */ |
527 | |
528 | while((2*index)<ndata) |
529 | { |
530 | int newindex; |
531 | int temp; |
532 | |
533 | newindex=2*index; |
534 | |
535 | if(compare(datap[heap[newindex]],datap[heap[newindex+1]])>=0) |
536 | newindex=newindex+1; |
537 | |
538 | if(compare(datap[heap[index]],datap[heap[newindex]])<=0) |
539 | break; |
540 | |
541 | temp=heap[newindex]; |
542 | heap[newindex]=heap[index]; |
543 | heap[index]=temp; |
544 | |
545 | index=newindex; |
546 | } |
547 | |
548 | if((2*index)==ndata) |
549 | { |
550 | int newindex; |
551 | int temp; |
552 | |
553 | newindex=2*index; |
554 | |
555 | if(compare(datap[heap[index]],datap[heap[newindex]])<=0) |
556 | ; /* break */ |
557 | else |
558 | { |
559 | temp=heap[newindex]; |
560 | heap[newindex]=heap[index]; |
561 | heap[index]=temp; |
562 | } |
563 | } |
564 | } |
565 | while(ndata>0); |
566 | |
567 | /* Tidy up */ |
568 | |
569 | tidy_and_exit: |
570 | |
571 | if(fds) |
572 | { |
573 | for(i=0;i<nfiles;i++) |
574 | CloseFile(fds[i]); |
575 | free(fds); |
576 | } |
577 | |
578 | if(heap) |
579 | free(heap); |
580 | |
581 | free(data); |
582 | free(filename); |
583 | } |
584 | |
585 | |
586 | /*++++++++++++++++++++++++++++++++++++++ |
587 | A function to sort an array of pointers efficiently. |
588 | |
589 | The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort, |
590 | in particular, this is good because it can operate in-place and doesn't |
591 | allocate more memory like using qsort() does. |
592 | |
593 | void **datap A pointer to the array of pointers to sort. |
594 | |
595 | size_t nitems The number of items of data to sort. |
596 | |
597 | int(*compare)(const void *, const void *) The comparison function (identical to qsort if the |
598 | data to be sorted was an array of things not pointers). |
599 | ++++++++++++++++++++++++++++++++++++++*/ |
600 | |
601 | void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*)) |
602 | { |
603 | void **datap1=&datap[-1]; |
604 | int i; |
605 | |
606 | /* Fill the heap by pretending to insert the data that is already there */ |
607 | |
608 | for(i=2;i<=nitems;i++) |
609 | { |
610 | int index=i; |
611 | |
612 | /* Bubble up the new value (upside-down, put largest at top) */ |
613 | |
614 | while(index>1) |
615 | { |
616 | int newindex; |
617 | void *temp; |
618 | |
619 | newindex=index/2; |
620 | |
621 | if(compare(datap1[index],datap1[newindex])<=0) /* reversed compared to filesort_fixed() above */ |
622 | break; |
623 | |
624 | temp=datap1[index]; |
625 | datap1[index]=datap1[newindex]; |
626 | datap1[newindex]=temp; |
627 | |
628 | index=newindex; |
629 | } |
630 | } |
631 | |
632 | /* Repeatedly pull out the root of the heap and swap with the bottom item */ |
633 | |
634 | for(i=nitems;i>1;i--) |
635 | { |
636 | int index=1; |
637 | void *temp; |
638 | |
639 | temp=datap1[index]; |
640 | datap1[index]=datap1[i]; |
641 | datap1[i]=temp; |
642 | |
643 | /* Bubble down the new value (upside-down, put largest at top) */ |
644 | |
645 | while((2*index)<(i-1)) |
646 | { |
647 | int newindex; |
648 | void *temp; |
649 | |
650 | newindex=2*index; |
651 | |
652 | if(compare(datap1[newindex],datap1[newindex+1])<=0) /* reversed compared to filesort_fixed() above */ |
653 | newindex=newindex+1; |
654 | |
655 | if(compare(datap1[index],datap1[newindex])>=0) /* reversed compared to filesort_fixed() above */ |
656 | break; |
657 | |
658 | temp=datap1[newindex]; |
659 | datap1[newindex]=datap1[index]; |
660 | datap1[index]=temp; |
661 | |
662 | index=newindex; |
663 | } |
664 | |
665 | if((2*index)==(i-1)) |
666 | { |
667 | int newindex; |
668 | void *temp; |
669 | |
670 | newindex=2*index; |
671 | |
672 | if(compare(datap1[index],datap1[newindex])>=0) /* reversed compared to filesort_fixed() above */ |
673 | ; /* break */ |
674 | else |
675 | { |
676 | temp=datap1[newindex]; |
677 | datap1[newindex]=datap1[index]; |
678 | datap1[index]=temp; |
679 | } |
680 | } |
681 | } |
682 | } |
Properties
Name | Value |
---|---|
cvs:description | Functions to perform sorting. |