2 回答
TA贡献1744条经验 获得超4个赞
我的两个思路是:(1) 对文件做内存映射(mmap),并对每个值用多行正则表达式进行搜索;(2) 把工作分配给多个子进程。我把两者结合起来,得到了下面的代码。也许可以在父进程中执行 mmap 再共享给子进程,但我选择了更简单的做法:在每个子进程中各自映射一次,并假设操作系统会高效地在进程之间共享这些内存页。
import multiprocessing as mp
import os
import mmap
import re
def _value_find_worker_init(filename):
    """Map ``filename`` read-only and publish it as ``global mmap_file``.

    Intended as the ``initializer`` of an ``mp.Pool``: it runs once in every
    worker process so that ``_value_find_worker`` can search the file through
    the module-level ``mmap_file`` name.

    Args:
        filename: Path of the file to map.

    Raises:
        OSError: If the file cannot be opened or mapped.
    """
    global mmap_file
    filenames_fd = os.open(filename, os.O_RDONLY)
    try:
        # Take the size from the descriptor that was actually opened rather
        # than doing a second path lookup.
        size = os.fstat(filenames_fd).st_size
        if size:
            mmap_file = mmap.mmap(filenames_fd, length=size,
                                  access=mmap.ACCESS_READ)
        else:
            # mmap refuses to map an empty file; an empty bytes object is
            # searchable by ``re`` in the same way and simply never matches.
            mmap_file = b""
    finally:
        # mmap keeps its own duplicate of the descriptor, so this one must be
        # closed here or it leaks for the lifetime of the worker.
        os.close(filenames_fd)
def _value_find_worker(value):
    """Return every line of ``global mmap_file`` that contains ``value``.

    Args:
        value: The bytes to look for; it is matched literally, not as a
            regular expression.

    Returns:
        A list of the matching lines as ``bytes``, without the trailing
        newline, in file order.
    """
    # Escape the value so regex metacharacters in it ('.', '(', '*', ...)
    # are searched for literally instead of mis-matching or raising re.error.
    # (?m) makes ^ and $ anchor at line boundaries so each match is one line.
    regex = b"(?m)^.*?" + re.escape(value) + b".*?$"
    return re.findall(regex, mmap_file)
def find_unique(values_path="UniqueValueList.txt",
                filenames_path="Filenames_File.txt",
                output_path="Filenames_With_Unique_Values.txt"):
    """Write every line of ``filenames_path`` that contains a listed value.

    The values are read one per line from ``values_path``. Each value is
    searched for in a pool of worker processes, and the distinct matching
    lines are written to ``output_path``, one per line.

    Args:
        values_path: File holding the values to search for, one per line.
        filenames_path: File whose lines are searched.
        output_path: File that receives the matching lines.
    """
    with open(values_path, "rb") as g:
        # Skip blank lines: an empty value would match every line of the
        # searched file.
        uniqueValues = [value for value in (line.strip() for line in g)
                        if value]

    matched_values = set()
    with mp.Pool(initializer=_value_find_worker_init,
                 initargs=(filenames_path,)) as pool:
        # Result order does not matter because matches are merged into a set.
        for matches in pool.imap_unordered(_value_find_worker, uniqueValues):
            matched_values.update(matches)

    with open(output_path, "wb") as outfile:
        # Sort so the output is reproducible; set iteration order is not.
        outfile.writelines(value + b"\n" for value in sorted(matched_values))
# Guard the entry point: with the "spawn" start method, worker processes
# re-import the main module, and an unguarded call would run again in each
# child instead of only in the parent.
if __name__ == "__main__":
    find_unique()
添加回答
举报