Check out the latest version of Routino: svn co http://routino.org/svn/trunk routino
Contents of /trunk/src/sorting.c
Parent Directory
|
Revision Log
Revision 543 -
(show annotations)
(download)
(as text)
Sat Dec 18 19:12:03 2010 UTC (14 years, 3 months ago) by amb
File MIME type: text/x-csrc
File size: 16070 byte(s)
Sat Dec 18 19:12:03 2010 UTC (14 years, 3 months ago) by amb
File MIME type: text/x-csrc
File size: 16070 byte(s)
Handle the case where there is no data in the file.
1 | /*************************************** |
2 | $Header: /home/amb/CVS/routino/src/sorting.c,v 1.13 2010-12-18 19:12:03 amb Exp $ |
3 | |
4 | Merge sort functions. |
5 | |
6 | Part of the Routino routing software. |
7 | ******************/ /****************** |
8 | This file Copyright 2009-2010 Andrew M. Bishop |
9 | |
10 | This program is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU Affero General Public License as published by |
12 | the Free Software Foundation, either version 3 of the License, or |
13 | (at your option) any later version. |
14 | |
15 | This program is distributed in the hope that it will be useful, |
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | GNU Affero General Public License for more details. |
19 | |
20 | You should have received a copy of the GNU Affero General Public License |
21 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
22 | ***************************************/ |
23 | |
24 | |
25 | #include <stdio.h> |
26 | #include <stdlib.h> |
27 | #include <string.h> |
28 | #include <assert.h> |
29 | |
30 | #include "types.h" |
31 | |
32 | #include "files.h" |
33 | #include "sorting.h" |
34 | |
35 | |
36 | /* Variables */ |
37 | |
38 | /*+ The command line '--tmpdir' option or its default value. +*/ |
39 | extern char *option_tmpdirname; |
40 | |
41 | /*+ The amount of RAM to use for filesorting. +*/ |
42 | extern size_t option_filesort_ramsize; |
43 | |
44 | |
45 | /*++++++++++++++++++++++++++++++++++++++ |
46 | A function to sort the contents of a file of fixed length objects using a |
47 | limited amount of RAM. |
48 | |
49 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
50 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
51 | The individual sort steps and the merge step both use a "Heap sort" |
52 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
53 | if the data is already partially sorted. |
54 | |
55 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
56 | |
57 | int fd_out The file descriptor of the output file (opened for writing and empty). |
58 | |
59 | size_t itemsize The size of each item in the file that needs sorting. |
60 | |
61 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
62 | data to be sorted is an array of things not pointers). |
63 | |
64 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
65 | returns 1 then it is written to the output file. |
66 | ++++++++++++++++++++++++++++++++++++++*/ |
67 | |
68 | void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
69 | { |
70 | int *fds=NULL,*heap=NULL; |
71 | int nfiles=0,ndata=0; |
72 | index_t count=0,total=0; |
73 | size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*)); |
74 | void *data=NULL,**datap=NULL; |
75 | char *filename; |
76 | int i,more=1; |
77 | |
78 | /* Allocate the RAM buffer and other bits */ |
79 | |
80 | data=malloc(nitems*itemsize); |
81 | datap=malloc(nitems*sizeof(void*)); |
82 | |
83 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
84 | |
85 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
86 | |
87 | do |
88 | { |
89 | int fd,n=0; |
90 | |
91 | /* Read in the data and create pointers */ |
92 | |
93 | for(i=0;i<nitems;i++) |
94 | { |
95 | datap[i]=data+i*itemsize; |
96 | |
97 | if(ReadFile(fd_in,datap[i],itemsize)) |
98 | { |
99 | more=0; |
100 | break; |
101 | } |
102 | |
103 | total++; |
104 | } |
105 | |
106 | n=i; |
107 | |
108 | /* No new data read in this time round */ |
109 | |
110 | if(n==0) |
111 | break; |
112 | |
113 | /* Shortcut if there are no files (i.e. no data at all) */ |
114 | |
115 | if(nfiles==0 && n==0) |
116 | goto tidy_and_exit; |
117 | |
118 | /* Sort the data pointers using a heap sort */ |
119 | |
120 | filesort_heapsort(datap,n,compare); |
121 | |
122 | /* Shortcut if all read in and sorted at once */ |
123 | |
124 | if(nfiles==0 && !more) |
125 | { |
126 | for(i=0;i<n;i++) |
127 | { |
128 | if(!buildindex || buildindex(datap[i],count)) |
129 | { |
130 | WriteFile(fd_out,datap[i],itemsize); |
131 | count++; |
132 | } |
133 | } |
134 | |
135 | goto tidy_and_exit; |
136 | } |
137 | |
138 | /* Create a temporary file and write the result */ |
139 | |
140 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
141 | |
142 | fd=OpenFileNew(filename); |
143 | |
144 | for(i=0;i<n;i++) |
145 | WriteFile(fd,datap[i],itemsize); |
146 | |
147 | CloseFile(fd); |
148 | |
149 | nfiles++; |
150 | } |
151 | while(more); |
152 | |
153 | /* Shortcut if only one file (unlucky for us there must have been exactly |
154 | nitems, lucky for us we still have the data in RAM) */ |
155 | |
156 | if(nfiles==1) |
157 | { |
158 | for(i=0;i<nitems;i++) |
159 | { |
160 | if(!buildindex || buildindex(datap[i],count)) |
161 | { |
162 | WriteFile(fd_out,datap[i],itemsize); |
163 | count++; |
164 | } |
165 | } |
166 | |
167 | DeleteFile(filename); |
168 | |
169 | goto tidy_and_exit; |
170 | } |
171 | |
172 | /* Check that number of files is less than file size */ |
173 | |
174 | assert(nfiles<nitems); |
175 | |
176 | /* Open all of the temporary files */ |
177 | |
178 | fds=(int*)malloc(nfiles*sizeof(int)); |
179 | |
180 | for(i=0;i<nfiles;i++) |
181 | { |
182 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
183 | |
184 | fds[i]=ReOpenFile(filename); |
185 | |
186 | DeleteFile(filename); |
187 | } |
188 | |
189 | /* Perform an n-way merge using a binary heap */ |
190 | |
191 | heap=(int*)malloc(nfiles*sizeof(int)); |
192 | |
193 | /* Fill the heap to start with */ |
194 | |
195 | for(i=0;i<nfiles;i++) |
196 | { |
197 | int index; |
198 | |
199 | datap[i]=data+i*itemsize; |
200 | |
201 | ReadFile(fds[i],datap[i],itemsize); |
202 | |
203 | heap[i]=i; |
204 | |
205 | index=i; |
206 | |
207 | /* Bubble up the new value */ |
208 | |
209 | while(index>0 && |
210 | compare(datap[heap[index]],datap[heap[(index-1)/2]])<0) |
211 | { |
212 | int newindex; |
213 | int temp; |
214 | |
215 | newindex=(index-1)/2; |
216 | |
217 | temp=heap[index]; |
218 | heap[index]=heap[newindex]; |
219 | heap[newindex]=temp; |
220 | |
221 | index=newindex; |
222 | } |
223 | } |
224 | |
225 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
226 | |
227 | ndata=nfiles; |
228 | |
229 | do |
230 | { |
231 | int index=0; |
232 | |
233 | if(!buildindex || buildindex(datap[heap[0]],count)) |
234 | { |
235 | WriteFile(fd_out,datap[heap[0]],itemsize); |
236 | count++; |
237 | } |
238 | |
239 | if(ReadFile(fds[heap[0]],datap[heap[0]],itemsize)) |
240 | { |
241 | ndata--; |
242 | heap[0]=heap[ndata]; |
243 | } |
244 | |
245 | /* Bubble down the new value */ |
246 | |
247 | while((2*index+2)<ndata && |
248 | (compare(datap[heap[index]],datap[heap[2*index+1]])>0 || |
249 | compare(datap[heap[index]],datap[heap[2*index+2]])>0)) |
250 | { |
251 | int newindex; |
252 | int temp; |
253 | |
254 | if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0) |
255 | newindex=2*index+1; |
256 | else |
257 | newindex=2*index+2; |
258 | |
259 | temp=heap[newindex]; |
260 | heap[newindex]=heap[index]; |
261 | heap[index]=temp; |
262 | |
263 | index=newindex; |
264 | } |
265 | |
266 | if((2*index+2)==ndata && |
267 | compare(datap[heap[index]],datap[heap[2*index+1]])>0) |
268 | { |
269 | int newindex; |
270 | int temp; |
271 | |
272 | newindex=2*index+1; |
273 | |
274 | temp=heap[newindex]; |
275 | heap[newindex]=heap[index]; |
276 | heap[index]=temp; |
277 | } |
278 | } |
279 | while(ndata>0); |
280 | |
281 | /* Tidy up */ |
282 | |
283 | tidy_and_exit: |
284 | |
285 | if(fds) |
286 | { |
287 | for(i=0;i<nfiles;i++) |
288 | CloseFile(fds[i]); |
289 | free(fds); |
290 | } |
291 | |
292 | if(heap) |
293 | free(heap); |
294 | |
295 | free(data); |
296 | free(datap); |
297 | free(filename); |
298 | } |
299 | |
300 | |
301 | /*++++++++++++++++++++++++++++++++++++++ |
302 | A function to sort the contents of a file of variable length objects (each |
303 | preceded by its length in 2 bytes) using a limited amount of RAM. |
304 | |
305 | The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort |
306 | and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting. |
307 | The individual sort steps and the merge step both use a "Heap sort" |
308 | http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well |
309 | if the data is already partially sorted. |
310 | |
311 | int fd_in The file descriptor of the input file (opened for reading and at the beginning). |
312 | |
313 | int fd_out The file descriptor of the output file (opened for writing and empty). |
314 | |
315 | int (*compare)(const void*, const void*) The comparison function (identical to qsort if the |
316 | data to be sorted is an array of things not pointers). |
317 | |
318 | int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it |
319 | returns 1 then it is written to the output file. |
320 | ++++++++++++++++++++++++++++++++++++++*/ |
321 | |
322 | void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t)) |
323 | { |
324 | int *fds=NULL,*heap=NULL; |
325 | int nfiles=0,ndata=0; |
326 | index_t count=0,total=0; |
327 | FILESORT_VARINT nextitemsize,largestitemsize=0; |
328 | void *data=NULL,**datap=NULL; |
329 | char *filename; |
330 | int i,more=1; |
331 | |
332 | /* Allocate the RAM buffer and other bits */ |
333 | |
334 | data=malloc(option_filesort_ramsize); |
335 | |
336 | filename=(char*)malloc(strlen(option_tmpdirname)+24); |
337 | |
338 | /* Loop around, fill the buffer, sort the data and write a temporary file */ |
339 | |
340 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */ |
341 | goto tidy_and_exit; |
342 | |
343 | do |
344 | { |
345 | int fd,n=0; |
346 | size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE; |
347 | |
348 | datap=data+option_filesort_ramsize; |
349 | |
350 | /* Read in the data and create pointers */ |
351 | |
352 | while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data)) |
353 | { |
354 | FILESORT_VARINT itemsize=nextitemsize; |
355 | |
356 | if(itemsize>largestitemsize) |
357 | largestitemsize=itemsize; |
358 | |
359 | *(FILESORT_VARINT*)(data+ramused)=itemsize; |
360 | |
361 | ramused+=FILESORT_VARSIZE; |
362 | |
363 | ReadFile(fd_in,data+ramused,itemsize); |
364 | |
365 | *--datap=data+ramused; /* points to real data */ |
366 | |
367 | ramused+=itemsize; |
368 | |
369 | ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN); |
370 | ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE; |
371 | |
372 | total++; |
373 | n++; |
374 | |
375 | if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) |
376 | { |
377 | more=0; |
378 | break; |
379 | } |
380 | } |
381 | |
382 | /* No new data read in this time round */ |
383 | |
384 | if(n==0) |
385 | break; |
386 | |
387 | /* Sort the data pointers using a heap sort */ |
388 | |
389 | filesort_heapsort(datap,n,compare); |
390 | |
391 | /* Shortcut if all read in and sorted at once */ |
392 | |
393 | if(nfiles==0 && !more) |
394 | { |
395 | for(i=0;i<n;i++) |
396 | { |
397 | if(!buildindex || buildindex(datap[i],count)) |
398 | { |
399 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
400 | |
401 | WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
402 | count++; |
403 | } |
404 | } |
405 | |
406 | goto tidy_and_exit; |
407 | } |
408 | |
409 | /* Create a temporary file and write the result */ |
410 | |
411 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles); |
412 | |
413 | fd=OpenFileNew(filename); |
414 | |
415 | for(i=0;i<n;i++) |
416 | { |
417 | FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE); |
418 | |
419 | WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
420 | } |
421 | |
422 | CloseFile(fd); |
423 | |
424 | nfiles++; |
425 | } |
426 | while(more); |
427 | |
428 | /* Check that number of files is less than file size */ |
429 | |
430 | largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN); |
431 | |
432 | assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize)); |
433 | |
434 | /* Open all of the temporary files */ |
435 | |
436 | fds=(int*)malloc(nfiles*sizeof(int)); |
437 | |
438 | for(i=0;i<nfiles;i++) |
439 | { |
440 | sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i); |
441 | |
442 | fds[i]=ReOpenFile(filename); |
443 | |
444 | DeleteFile(filename); |
445 | } |
446 | |
447 | /* Perform an n-way merge using a binary heap */ |
448 | |
449 | heap=(int*)malloc(nfiles*sizeof(int)); |
450 | |
451 | datap=data+option_filesort_ramsize-nfiles*sizeof(void*); |
452 | |
453 | /* Fill the heap to start with */ |
454 | |
455 | for(i=0;i<nfiles;i++) |
456 | { |
457 | int index; |
458 | FILESORT_VARINT itemsize; |
459 | |
460 | datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize; |
461 | |
462 | ReadFile(fds[i],&itemsize,FILESORT_VARSIZE); |
463 | |
464 | *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize; |
465 | |
466 | ReadFile(fds[i],datap[i],itemsize); |
467 | |
468 | heap[i]=i; |
469 | |
470 | index=i; |
471 | |
472 | /* Bubble up the new value */ |
473 | |
474 | while(index>0 && |
475 | compare(datap[heap[index]],datap[heap[(index-1)/2]])<0) |
476 | { |
477 | int newindex; |
478 | int temp; |
479 | |
480 | newindex=(index-1)/2; |
481 | |
482 | temp=heap[index]; |
483 | heap[index]=heap[newindex]; |
484 | heap[newindex]=temp; |
485 | |
486 | index=newindex; |
487 | } |
488 | } |
489 | |
490 | /* Repeatedly pull out the root of the heap and refill from the same file */ |
491 | |
492 | ndata=nfiles; |
493 | |
494 | do |
495 | { |
496 | int index=0; |
497 | FILESORT_VARINT itemsize; |
498 | |
499 | if(!buildindex || buildindex(datap[heap[0]],count)) |
500 | { |
501 | itemsize=*(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE); |
502 | |
503 | WriteFile(fd_out,datap[heap[0]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE); |
504 | count++; |
505 | } |
506 | |
507 | if(ReadFile(fds[heap[0]],&itemsize,FILESORT_VARSIZE)) |
508 | { |
509 | ndata--; |
510 | heap[0]=heap[ndata]; |
511 | } |
512 | else |
513 | { |
514 | *(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE)=itemsize; |
515 | |
516 | ReadFile(fds[heap[0]],datap[heap[0]],itemsize); |
517 | } |
518 | |
519 | /* Bubble down the new value */ |
520 | |
521 | while((2*index+2)<ndata && |
522 | (compare(datap[heap[index]],datap[heap[2*index+1]])>0 || |
523 | compare(datap[heap[index]],datap[heap[2*index+2]])>0)) |
524 | { |
525 | int newindex; |
526 | int temp; |
527 | |
528 | if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0) |
529 | newindex=2*index+1; |
530 | else |
531 | newindex=2*index+2; |
532 | |
533 | temp=heap[newindex]; |
534 | heap[newindex]=heap[index]; |
535 | heap[index]=temp; |
536 | |
537 | index=newindex; |
538 | } |
539 | |
540 | if((2*index+2)==ndata && |
541 | compare(datap[heap[index]],datap[heap[2*index+1]])>0) |
542 | { |
543 | int newindex; |
544 | int temp; |
545 | |
546 | newindex=2*index+1; |
547 | |
548 | temp=heap[newindex]; |
549 | heap[newindex]=heap[index]; |
550 | heap[index]=temp; |
551 | } |
552 | } |
553 | while(ndata>0); |
554 | |
555 | /* Tidy up */ |
556 | |
557 | tidy_and_exit: |
558 | |
559 | if(fds) |
560 | { |
561 | for(i=0;i<nfiles;i++) |
562 | CloseFile(fds[i]); |
563 | free(fds); |
564 | } |
565 | |
566 | if(heap) |
567 | free(heap); |
568 | |
569 | free(data); |
570 | free(filename); |
571 | } |
572 | |
573 | |
574 | /*++++++++++++++++++++++++++++++++++++++ |
575 | A function to sort an array of pointers efficiently. |
576 | |
577 | The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort, |
578 | in particular an this good because it can operate in-place and doesn't |
579 | allocate more memory like using qsort() does. |
580 | |
581 | void **datap A pointer to the array of pointers to sort. |
582 | |
583 | size_t nitems The number of items of data to sort. |
584 | |
585 | int(*compare)(const void *, const void *) The comparison function (identical to qsort if the |
586 | data to be sorted was an array of things not pointers). |
587 | ++++++++++++++++++++++++++++++++++++++*/ |
588 | |
589 | void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*)) |
590 | { |
591 | int i; |
592 | |
593 | /* Fill the heap by pretending to insert the data that is already there */ |
594 | |
595 | for(i=1;i<nitems;i++) |
596 | { |
597 | int index=i; |
598 | |
599 | /* Bubble up the new value (upside-down, put largest at top) */ |
600 | |
601 | while(index>0 && |
602 | compare(datap[index],datap[(index-1)/2])>0) /* reversed compared to filesort() above */ |
603 | { |
604 | int newindex; |
605 | void *temp; |
606 | |
607 | newindex=(index-1)/2; |
608 | |
609 | temp=datap[index]; |
610 | datap[index]=datap[newindex]; |
611 | datap[newindex]=temp; |
612 | |
613 | index=newindex; |
614 | } |
615 | } |
616 | |
617 | /* Repeatedly pull out the root of the heap and swap with the bottom item */ |
618 | |
619 | for(i=nitems-1;i>0;i--) |
620 | { |
621 | int index=0; |
622 | void *temp; |
623 | |
624 | temp=datap[index]; |
625 | datap[index]=datap[i]; |
626 | datap[i]=temp; |
627 | |
628 | /* Bubble down the new value (upside-down, put largest at top) */ |
629 | |
630 | while((2*index+2)<i && |
631 | (compare(datap[index],datap[2*index+1])<0 || /* reversed compared to filesort() above */ |
632 | compare(datap[index],datap[2*index+2])<0)) /* reversed compared to filesort() above */ |
633 | { |
634 | int newindex; |
635 | void *temp; |
636 | |
637 | if(compare(datap[2*index+1],datap[2*index+2])>0) /* reversed compared to filesort() above */ |
638 | newindex=2*index+1; |
639 | else |
640 | newindex=2*index+2; |
641 | |
642 | temp=datap[newindex]; |
643 | datap[newindex]=datap[index]; |
644 | datap[index]=temp; |
645 | |
646 | index=newindex; |
647 | } |
648 | |
649 | if((2*index+2)==i && |
650 | compare(datap[index],datap[2*index+1])<0) /* reversed compared to filesort() above */ |
651 | { |
652 | int newindex; |
653 | void *temp; |
654 | |
655 | newindex=2*index+1; |
656 | |
657 | temp=datap[newindex]; |
658 | datap[newindex]=datap[index]; |
659 | datap[index]=temp; |
660 | } |
661 | } |
662 | } |
Properties
Name | Value |
---|---|
cvs:description | Functions to perform sorting. |