Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino
Contents of /trunk/src/sorting.c
Parent Directory
|
Revision Log
Revision 503 -
(show annotations)
(download)
(as text)
Sat Sep 25 13:54:35 2010 UTC (14 years, 5 months ago) by amb
File MIME type: text/x-csrc
File size: 15840 byte(s)
Sat Sep 25 13:54:35 2010 UTC (14 years, 5 months ago) by amb
File MIME type: text/x-csrc
File size: 15840 byte(s)
Rename the heapsort() function to filesort_heapsort().
1 | /*************************************** |
2 | $Header: /home/amb/CVS/routino/src/sorting.c,v 1.11 2010-09-25 13:54:18 amb Exp $ |
3 | |
4 | Merge sort functions. |
5 | |
6 | Part of the Routino routing software. |
7 | ******************/ /****************** |
8 | This file Copyright 2009-2010 Andrew M. Bishop |
9 | |
10 | This program is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU Affero General Public License as published by |
12 | the Free Software Foundation, either version 3 of the License, or |
13 | (at your option) any later version. |
14 | |
15 | This program is distributed in the hope that it will be useful, |
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | GNU Affero General Public License for more details. |
19 | |
20 | You should have received a copy of the GNU Affero General Public License |
21 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
22 | ***************************************/ |
23 | |
24 | |
25 | #include <stdio.h> |
26 | #include <stdlib.h> |
27 | #include <string.h> |
28 | #include <assert.h> |
29 | |
30 | #include "files.h" |
31 | #include "functions.h" |
32 | |
33 | |
34 | /* Variables */ |
35 | |
36 | /*+ The command line '--tmpdir' option or its default value. +*/ |
37 | extern char *option_tmpdirname; |
38 | |
39 | /*+ The amount of RAM to use for filesorting. +*/ |
40 | extern size_t option_filesort_ramsize; |
41 | |
42 | |
43 | /*++++++++++++++++++++++++++++++++++++++ |
44 | A function to sort the contents of a file of fixed length objects using a |
45 | limited amount of RAM. |
46 | |
47 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
48 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
49 | The individual sort steps and the merge step both use a "Heap sort" |
50 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
51 | if the data is already partially sorted. |
52 | |
53 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
54 | |
55 | int fd_out The file descriptor of the output file (opened for writing and empty). |
56 | |
57 | size_t itemsize The size of each item in the file that needs sorting. |
58 | |
59 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
60 | data to be sorted is an array of things not pointers). |
61 | |
62 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
63 | returns 1 then it is written to the output file. |
64 | ++++++++++++++++++++++++++++++++++++++*/ |
65 | |
66 | void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
67 | { |
68 | int *fds=NULL,*heap=NULL; |
69 | int nfiles=0,ndata=0; |
70 | index_t count=0,total=0; |
71 | size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*)); |
72 | void *data=NULL,**datap=NULL; |
73 | char *filename; |
74 | int i,more=1; |
75 | |
76 | /* Allocate the RAM buffer and other bits */ |
77 | |
78 | data=malloc(nitems*itemsize); |
79 | datap=malloc(nitems*sizeof(void*)); |
80 | |
81 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
82 | |
83 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
84 | |
85 | do |
86 | { |
87 | int fd,n=0; |
88 | |
89 | /* Read in the data and create pointers */ |
90 | |
91 | for(i=0;i<nitems;i++) |
92 | { |
93 | datap[i]=data+i*itemsize; |
94 | |
95 | if(ReadFile(fd_in,datap[i],itemsize)) |
96 | { |
97 | more=0; |
98 | break; |
99 | } |
100 | |
101 | total++; |
102 | } |
103 | |
104 | n=i; |
105 | |
106 | if(n==0) |
107 | break; |
108 | |
109 | /* Sort the data pointers using a heap sort */ |
110 | |
111 | filesort_heapsort(datap,n,compare); |
112 | |
113 | /* Shortcut if all read in and sorted at once */ |
114 | |
115 | if(nfiles==0 && !more) |
116 | { |
117 | for(i=0;i<n;i++) |
118 | { |
119 | if(!buildindex || buildindex(datap[i],count)) |
120 | { |
121 | WriteFile(fd_out,datap[i],itemsize); |
122 | count++; |
123 | } |
124 | } |
125 | |
126 | goto tidy_and_exit; |
127 | } |
128 | |
129 | /* Create a temporary file and write the result */ |
130 | |
131 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
132 | |
133 | fd=OpenFileNew(filename); |
134 | |
135 | for(i=0;i<n;i++) |
136 | WriteFile(fd,datap[i],itemsize); |
137 | |
138 | CloseFile(fd); |
139 | |
140 | nfiles++; |
141 | } |
142 | while(more); |
143 | |
144 | /* Shortcut if only one file (unlucky for us there must have been exactly |
145 | nitems, lucky for us we still have the data in RAM) */ |
146 | |
147 | if(nfiles==1) |
148 | { |
149 | for(i=0;i<nitems;i++) |
150 | { |
151 | if(!buildindex || buildindex(datap[i],count)) |
152 | { |
153 | WriteFile(fd_out,datap[i],itemsize); |
154 | count++; |
155 | } |
156 | } |
157 | |
158 | DeleteFile(filename); |
159 | |
160 | goto tidy_and_exit; |
161 | } |
162 | |
163 | /* Check that number of files is less than file size */ |
164 | |
165 | assert(nfiles<nitems); |
166 | |
167 | /* Open all of the temporary files */ |
168 | |
169 | fds=(int*)malloc(nfiles*sizeof(int)); |
170 | |
171 | for(i=0;i<nfiles;i++) |
172 | { |
173 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
174 | |
175 | fds[i]=ReOpenFile(filename); |
176 | |
177 | DeleteFile(filename); |
178 | } |
179 | |
180 | /* Perform an n-way merge using a binary heap */ |
181 | |
182 | heap=(int*)malloc(nfiles*sizeof(int)); |
183 | |
184 | /* Fill the heap to start with */ |
185 | |
186 | for(i=0;i<nfiles;i++) |
187 | { |
188 | int index; |
189 | |
190 | datap[i]=data+i*itemsize; |
191 | |
192 | ReadFile(fds[i],datap[i],itemsize); |
193 | |
194 | heap[i]=i; |
195 | |
196 | index=i; |
197 | |
198 | /* Bubble up the new value */ |
199 | |
200 | while(index>0 && |
201 | compare(datap[heap[index]],datap[heap[(index-1)/2]])<0) |
202 | { |
203 | int newindex; |
204 | int temp; |
205 | |
206 | newindex=(index-1)/2; |
207 | |
208 | temp=heap[index]; |
209 | heap[index]=heap[newindex]; |
210 | heap[newindex]=temp; |
211 | |
212 | index=newindex; |
213 | } |
214 | } |
215 | |
216 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
217 | |
218 | ndata=nfiles; |
219 | |
220 | do |
221 | { |
222 | int index=0; |
223 | |
224 | if(!buildindex || buildindex(datap[heap[0]],count)) |
225 | { |
226 | WriteFile(fd_out,datap[heap[0]],itemsize); |
227 | count++; |
228 | } |
229 | |
230 | if(ReadFile(fds[heap[0]],datap[heap[0]],itemsize)) |
231 | { |
232 | ndata--; |
233 | heap[0]=heap[ndata]; |
234 | } |
235 | |
236 | /* Bubble down the new value */ |
237 | |
238 | while((2*index+2)<ndata && |
239 | (compare(datap[heap[index]],datap[heap[2*index+1]])>0 || |
240 | compare(datap[heap[index]],datap[heap[2*index+2]])>0)) |
241 | { |
242 | int newindex; |
243 | int temp; |
244 | |
245 | if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0) |
246 | newindex=2*index+1; |
247 | else |
248 | newindex=2*index+2; |
249 | |
250 | temp=heap[newindex]; |
251 | heap[newindex]=heap[index]; |
252 | heap[index]=temp; |
253 | |
254 | index=newindex; |
255 | } |
256 | |
257 | if((2*index+2)==ndata && |
258 | compare(datap[heap[index]],datap[heap[2*index+1]])>0) |
259 | { |
260 | int newindex; |
261 | int temp; |
262 | |
263 | newindex=2*index+1; |
264 | |
265 | temp=heap[newindex]; |
266 | heap[newindex]=heap[index]; |
267 | heap[index]=temp; |
268 | } |
269 | } |
270 | while(ndata>0); |
271 | |
272 | /* Tidy up */ |
273 | |
274 | tidy_and_exit: |
275 | |
276 | if(fds) |
277 | { |
278 | for(i=0;i<nfiles;i++) |
279 | CloseFile(fds[i]); |
280 | free(fds); |
281 | } |
282 | |
283 | if(heap) |
284 | free(heap); |
285 | |
286 | free(data); |
287 | free(datap); |
288 | free(filename); |
289 | } |
290 | |
291 | |
292 | /*++++++++++++++++++++++++++++++++++++++ |
293 | A function to sort the contents of a file of variable length objects (each |
294 | preceded by its length in 2 bytes) using a limited amount of RAM. |
295 | |
296 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
297 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
298 | The individual sort steps and the merge step both use a "Heap sort" |
299 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
300 | if the data is already partially sorted. |
301 | |
302 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
303 | |
304 | int fd_out The file descriptor of the output file (opened for writing and empty). |
305 | |
306 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
307 | data to be sorted is an array of things not pointers). |
308 | |
309 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
310 | returns 1 then it is written to the output file. |
311 | ++++++++++++++++++++++++++++++++++++++*/ |
312 | |
313 | void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
314 | { |
315 | int *fds=NULL,*heap=NULL; |
316 | int nfiles=0,ndata=0; |
317 | index_t count=0,total=0; |
318 | FILESORT_VARINT nextitemsize,largestitemsize=0; |
319 | void *data=NULL,**datap=NULL; |
320 | char *filename; |
321 | int i,more=1; |
322 | |
323 | /* Allocate the RAM buffer and other bits */ |
324 | |
325 | data=malloc(option_filesort_ramsize); |
326 | |
327 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
328 | |
329 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
330 | |
331 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */ |
332 | goto tidy_and_exit; |
333 | |
334 | do |
335 | { |
336 | int fd,n=0; |
337 | size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE; |
338 | |
339 | datap=data+option_filesort_ramsize; |
340 | |
341 | /* Read in the data and create pointers */ |
342 | |
343 | while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data)) |
344 | { |
345 | FILESORT_VARINT itemsize=nextitemsize; |
346 | |
347 | if(itemsize>largestitemsize) |
348 | largestitemsize=itemsize; |
349 | |
350 | *(FILESORT_VARINT*)(data+ramused)=itemsize; |
351 | |
352 | ramused+=FILESORT_VARSIZE; |
353 | |
354 | ReadFile(fd_in,data+ramused,itemsize); |
355 | |
356 | *--datap=data+ramused; /* points to real data */ |
357 | |
358 | ramused+=itemsize; |
359 | |
360 | ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN); |
361 | ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE; |
362 | |
363 | total++; |
364 | n++; |
365 | |
366 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) |
367 | { |
368 | more=0; |
369 | break; |
370 | } |
371 | } |
372 | |
373 | if(n==0) |
374 | break; |
375 | |
376 | /* Sort the data pointers using a heap sort */ |
377 | |
378 | filesort_heapsort(datap,n,compare); |
379 | |
380 | /* Shortcut if all read in and sorted at once */ |
381 | |
382 | if(nfiles==0 && !more) |
383 | { |
384 | for(i=0;i<n;i++) |
385 | { |
386 | if(!buildindex || buildindex(datap[i],count)) |
387 | { |
388 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
389 | |
390 | WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
391 | count++; |
392 | } |
393 | } |
394 | |
395 | goto tidy_and_exit; |
396 | } |
397 | |
398 | /* Create a temporary file and write the result */ |
399 | |
400 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
401 | |
402 | fd=OpenFileNew(filename); |
403 | |
404 | for(i=0;i<n;i++) |
405 | { |
406 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
407 | |
408 | WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
409 | } |
410 | |
411 | CloseFile(fd); |
412 | |
413 | nfiles++; |
414 | } |
415 | while(more); |
416 | |
417 | /* Check that number of files is less than file size */ |
418 | |
419 | largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN); |
420 | |
421 | assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize)); |
422 | |
423 | /* Open all of the temporary files */ |
424 | |
425 | fds=(int*)malloc(nfiles*sizeof(int)); |
426 | |
427 | for(i=0;i<nfiles;i++) |
428 | { |
429 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
430 | |
431 | fds[i]=ReOpenFile(filename); |
432 | |
433 | DeleteFile(filename); |
434 | } |
435 | |
436 | /* Perform an n-way merge using a binary heap */ |
437 | |
438 | heap=(int*)malloc(nfiles*sizeof(int)); |
439 | |
440 | datap=data+option_filesort_ramsize-nfiles*sizeof(void*); |
441 | |
442 | /* Fill the heap to start with */ |
443 | |
444 | for(i=0;i<nfiles;i++) |
445 | { |
446 | int index; |
447 | FILESORT_VARINT itemsize; |
448 | |
449 | datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize; |
450 | |
451 | ReadFile(fds[i],&itemsize,FILESORT_VARSIZE); |
452 | |
453 | *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize; |
454 | |
455 | ReadFile(fds[i],datap[i],itemsize); |
456 | |
457 | heap[i]=i; |
458 | |
459 | index=i; |
460 | |
461 | /* Bubble up the new value */ |
462 | |
463 | while(index>0 && |
464 | compare(datap[heap[index]],datap[heap[(index-1)/2]])<0) |
465 | { |
466 | int newindex; |
467 | int temp; |
468 | |
469 | newindex=(index-1)/2; |
470 | |
471 | temp=heap[index]; |
472 | heap[index]=heap[newindex]; |
473 | heap[newindex]=temp; |
474 | |
475 | index=newindex; |
476 | } |
477 | } |
478 | |
479 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
480 | |
481 | ndata=nfiles; |
482 | |
483 | do |
484 | { |
485 | int index=0; |
486 | FILESORT_VARINT itemsize; |
487 | |
488 | if(!buildindex || buildindex(datap[heap[0]],count)) |
489 | { |
490 | itemsize=*(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE); |
491 | |
492 | WriteFile(fd_out,datap[heap[0]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
493 | count++; |
494 | } |
495 | |
496 | if(ReadFile(fds[heap[0]],&itemsize,FILESORT_VARSIZE)) |
497 | { |
498 | ndata--; |
499 | heap[0]=heap[ndata]; |
500 | } |
501 | else |
502 | { |
503 | *(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE)=itemsize; |
504 | |
505 | ReadFile(fds[heap[0]],datap[heap[0]],itemsize); |
506 | } |
507 | |
508 | /* Bubble down the new value */ |
509 | |
510 | while((2*index+2)<ndata && |
511 | (compare(datap[heap[index]],datap[heap[2*index+1]])>0 || |
512 | compare(datap[heap[index]],datap[heap[2*index+2]])>0)) |
513 | { |
514 | int newindex; |
515 | int temp; |
516 | |
517 | if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0) |
518 | newindex=2*index+1; |
519 | else |
520 | newindex=2*index+2; |
521 | |
522 | temp=heap[newindex]; |
523 | heap[newindex]=heap[index]; |
524 | heap[index]=temp; |
525 | |
526 | index=newindex; |
527 | } |
528 | |
529 | if((2*index+2)==ndata && |
530 | compare(datap[heap[index]],datap[heap[2*index+1]])>0) |
531 | { |
532 | int newindex; |
533 | int temp; |
534 | |
535 | newindex=2*index+1; |
536 | |
537 | temp=heap[newindex]; |
538 | heap[newindex]=heap[index]; |
539 | heap[index]=temp; |
540 | } |
541 | } |
542 | while(ndata>0); |
543 | |
544 | /* Tidy up */ |
545 | |
546 | tidy_and_exit: |
547 | |
548 | if(fds) |
549 | { |
550 | for(i=0;i<nfiles;i++) |
551 | CloseFile(fds[i]); |
552 | free(fds); |
553 | } |
554 | |
555 | if(heap) |
556 | free(heap); |
557 | |
558 | free(data); |
559 | free(filename); |
560 | } |
561 | |
562 | |
563 | /*++++++++++++++++++++++++++++++++++++++ |
564 | A function to sort an array of pointers efficiently. |
565 | |
566 | The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort, |
567 | in particular an this good because it can operate in-place and doesn't |
568 | allocate more memory like using qsort() does. |
569 | |
570 | void **datap A pointer to the array of pointers to sort. |
571 | |
572 | size_t nitems The number of items of data to sort. |
573 | |
574 | int(*compare)(const void *, const void *) The comparison function (identical to qsort if the |
575 | data to be sorted was an array of things not pointers). |
576 | ++++++++++++++++++++++++++++++++++++++*/ |
577 | |
578 | void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*)) |
579 | { |
580 | int i; |
581 | |
582 | /* Fill the heap by pretending to insert the data that is already there */ |
583 | |
584 | for(i=1;i<nitems;i++) |
585 | { |
586 | int index=i; |
587 | |
588 | /* Bubble up the new value (upside-down, put largest at top) */ |
589 | |
590 | while(index>0 && |
591 | compare(datap[index],datap[(index-1)/2])>0) /* reversed compared to filesort() above */ |
592 | { |
593 | int newindex; |
594 | void *temp; |
595 | |
596 | newindex=(index-1)/2; |
597 | |
598 | temp=datap[index]; |
599 | datap[index]=datap[newindex]; |
600 | datap[newindex]=temp; |
601 | |
602 | index=newindex; |
603 | } |
604 | } |
605 | |
606 | /* Repeatedly pull out the root of the heap and swap with the bottom item */ |
607 | |
608 | for(i=nitems-1;i>0;i--) |
609 | { |
610 | int index=0; |
611 | void *temp; |
612 | |
613 | temp=datap[index]; |
614 | datap[index]=datap[i]; |
615 | datap[i]=temp; |
616 | |
617 | /* Bubble down the new value (upside-down, put largest at top) */ |
618 | |
619 | while((2*index+2)<i && |
620 | (compare(datap[index],datap[2*index+1])<0 || /* reversed compared to filesort() above */ |
621 | compare(datap[index],datap[2*index+2])<0)) /* reversed compared to filesort() above */ |
622 | { |
623 | int newindex; |
624 | void *temp; |
625 | |
626 | if(compare(datap[2*index+1],datap[2*index+2])>0) /* reversed compared to filesort() above */ |
627 | newindex=2*index+1; |
628 | else |
629 | newindex=2*index+2; |
630 | |
631 | temp=datap[newindex]; |
632 | datap[newindex]=datap[index]; |
633 | datap[index]=temp; |
634 | |
635 | index=newindex; |
636 | } |
637 | |
638 | if((2*index+2)==i && |
639 | compare(datap[index],datap[2*index+1])<0) /* reversed compared to filesort() above */ |
640 | { |
641 | int newindex; |
642 | void *temp; |
643 | |
644 | newindex=2*index+1; |
645 | |
646 | temp=datap[newindex]; |
647 | datap[newindex]=datap[index]; |
648 | datap[index]=temp; |
649 | } |
650 | } |
651 | } |
Properties
Name | Value |
---|---|
cvs:description | Functions to perform sorting. |