-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexternal_merge_sort.py
More file actions
117 lines (92 loc) · 3.03 KB
/
external_merge_sort.py
File metadata and controls
117 lines (92 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""External merge sort.
The implementation below is based on:
http://web.archive.org/web/20230202171450/https://www.geeksforgeeks.org/external-sorting/
```bash
python external_merge_sort.py
sort input.txt > expected.txt
cmp expected.txt output.txt
rm expected.txt input.txt output.txt partition*.txt
```
Sources:
* http://web.archive.org/web/20230202171450/https://www.geeksforgeeks.org/external-sorting/
* http://web.archive.org/web/20230202171737/https://www.geeksforgeeks.org/internal-implementation-of-linux-sort-command/
* https://github.com/coreutils/coreutils/blob/master/src/sort.c
* https://en.wikipedia.org/wiki/External_sorting#External_merge_sort
* http://web.archive.org/web/20230202171847/http://vkundeti.blogspot.com/2008/03/tech-algorithmic-details-of-unix-sort.html
* http://web.archive.org/web/20230202172221/https://www.geeksforgeeks.org/merge-sort-using-multi-threading/
"""
import heapq
import psutil
import string
import numpy as np
def _get_partition_file(idx):
return "partition{idx}.txt".format(idx=idx)
def partition_and_sort(input_file, partition_size, num_partitions):
fin = open(input_file, "r")
fout = []
for i in range(num_partitions):
partition_file = _get_partition_file(i)
fout.append(open(partition_file, "w"))
fidx = 0
lines = []
for line in fin:
lines.append(line.strip() + "\n")
if len(lines) == partition_size:
lines.sort()
for line in lines:
fout[fidx].write(line)
fidx += 1
lines = []
if len(lines) > 0:
lines.sort()
for line in lines:
fout[fidx].write(line)
for i in range(len(fout)):
fout[i].close()
fin.close()
def merge(output_file, num_partitions):
fin = []
for i in range(num_partitions):
partition_file = _get_partition_file(i)
fin.append(open(partition_file, "r"))
fout = open(output_file, "w")
# Seed the heap with a line
# from each file.
heap = []
for i in range(len(fin)):
line = fin[i].readline()
if not line:
break
heapq.heappush(heap, (line, i))
num_finished = 0
while num_finished <= i:
root = heapq.heappop(heap)
fout.write(root[0])
# Whenever we write a line from one of the files,
# we read a line from that file and add it to heap,
# so that the heap always contains one line from
# each file (until we exhaust the file).
line = fin[root[1]].readline()
if line:
heapq.heappush(heap, (line, root[1]))
else:
num_finished += 1
for i in range(len(fin)):
fin[i].close()
fout.close()
def external_sort(input_file, output_file, partition_size, num_partitions):
partition_and_sort(input_file, partition_size, num_partitions)
merge(output_file, num_partitions)
if __name__ == "__main__":
num_partitions = 10
partition_size = 1000
input_file = "input.txt"
output_file = "output.txt"
rng = np.random.default_rng(seed=0)
words = []
alphabet = [s for s in string.ascii_lowercase]
for _ in range(num_partitions * partition_size):
words.append("".join(rng.choice(alphabet, size=10)))
with open(input_file, "w") as fin:
fin.write("\n".join(words))
external_sort(input_file, output_file, partition_size, num_partitions)