-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathw2vtool.py
85 lines (69 loc) · 2.62 KB
/
w2vtool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
__author__ = 'Kerem'
import gensim,string,argparse,sys
class Feeder():
    """Restartable iterator over the lines of a text file.

    The file is reopened on every ``iter()`` call, so one instance can be
    consumed any number of times.
    """

    def __init__(self, path):
        """Store the path of the file to read; nothing is opened here."""
        self.file = path

    def __iter__(self):
        """Yield each line of the file with surrounding whitespace stripped."""
        with open(self.file, 'r') as f:
            # Iterate the handle directly instead of calling readlines():
            # lines are streamed one at a time, so a large corpus is never
            # held in memory as a whole.
            for line in f:
                yield line.strip()
class VectorCreator():
    """Train a gensim Word2Vec model on a one-sentence-per-line text file.

    Parameters
    ----------
    input : path of the training file, or None for 'preprocessed.txt'.
    output : path the trained model is saved to, or None for 'model'.
    workers : number of worker threads passed to Word2Vec.
    size : dimensionality of the word vectors.
    """

    def __init__(self, input, output, workers, size):
        self.workers = workers
        self.size = size
        if input is None:
            input = 'preprocessed.txt'
        # Raw (stripped) lines of the training file.
        self.sentences = Feeder(input)
        if output is None:
            output = 'model'
        self.output = output

    def tokenized_sentences(self):
        """Return a restartable iterable of token lists, one per input line."""
        return _TokenizedSentences(self.sentences)

    def create(self):
        """Train the model on the tokenized sentences and save it to ``self.output``."""
        # gensim 4 renamed the ``size`` keyword to ``vector_size``; choose the
        # spelling that matches the installed major version.
        if int(gensim.__version__.split('.')[0]) >= 4:
            dimension = {'vector_size': self.size}
        else:
            dimension = {'size': self.size}
        # Word2Vec expects every sentence to be a list of words. Passing the
        # raw line strings would make it iterate over single characters.
        model = gensim.models.Word2Vec(
            self.tokenized_sentences(), workers=self.workers, **dimension)
        model.save(self.output)


class _TokenizedSentences():
    """Restartable view that splits each line of an iterable on whitespace."""

    def __init__(self, lines):
        self.lines = lines

    def __iter__(self):
        # A class with __iter__ (rather than a generator) is used because the
        # corpus has to be iterable more than once during training.
        for line in self.lines:
            yield line.split()
class Preprocessor():
    """Strip punctuation from a text file and lowercase it, line by line.

    Parameters
    ----------
    input : path of the raw text file.
    output : path of the cleaned file; falls back to 'preprocessed.txt'
        when omitted or empty.
    """

    def __init__(self, input, output=None):
        # Characters to delete: ASCII punctuation plus curly quotes.
        # Can be extended; needs an update for OpenNMT separators.
        rmv = string.punctuation + '‘’“”'
        self.translator = str.maketrans('', '', rmv)
        self.input = input
        if output:
            self.output = output
        else:
            self.output = 'preprocessed.txt'

    def write_file(self, line):
        """Append ``line`` followed by a newline to the output file."""
        with open(self.output, 'a') as f:
            print(line, file=f)

    def preprocess(self):
        """Clean every input line and append the result to the output file."""
        # Open the output once for the whole run instead of once per line.
        # NOTE(review): append mode is kept from the original, so an output
        # file left over from an earlier run is extended, not replaced.
        with open(self.output, 'a') as out:
            for line in Feeder(self.input):
                # TODO: collapse repeated spaces left behind by the removal.
                print(line.translate(self.translator).lower(), file=out)
class Runner():
    """Dispatch the parsed command-line arguments to the requested pipeline.

    Constructing a Runner runs the pipeline immediately.
    """

    def __init__(self, args):
        self.input = args.input
        self.run(args)

    def run(self, args):
        """Run preprocessing, training, or both, depending on ``args``.

        * ``args.only_preprocess`` set: clean the input into that file.
        * ``args.only_train`` set: train on the input and save the model
          under that name.
        * otherwise: clean the input, then train on the cleaned file and
          save the model as ``args.model``.
        """
        if args.only_preprocess:
            p = Preprocessor(self.input, args.only_preprocess)
            p.preprocess()
        elif args.only_train:
            vc = VectorCreator(args.input, args.only_train, args.workers, args.size)
            vc.create()
        else:
            p = Preprocessor(self.input)
            p.preprocess()
            # Train on the file the preprocessor just wrote, not on the raw
            # input; otherwise the preprocessing step has no effect.
            vc = VectorCreator(p.output, args.model, args.workers, args.size)
            vc.create()
def main(arguments):
    """Parse ``arguments`` (a list of CLI tokens) and start a Runner.

    Returns None. Nothing is run when the parsed ``--input`` value is empty.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__,
    )
    add_option = parser.add_argument
    add_option(
        '--input', type=str, required=True,
        help="Input to generate vectors (raw file for default, preprocessed file for only-train mode )",
    )
    add_option(
        '--model', type=str, required=False, default='model',
        help="Name for model",
    )
    add_option(
        '--workers', type=int, required=False, default=4,
        help="Workers for training (default:4)",
    )
    add_option(
        '--size', type=int, required=False, default=300,
        help="Dimension of vectors (default:300)",
    )
    add_option(
        '--only-preprocess', type=str, required=False,
        help="Only preprocess input file, enter name for output",
    )
    add_option(
        '--only-train', type=str, required=False,
        help="Only train preprocessed input file, enter name for output",
    )

    parsed = parser.parse_args(arguments)
    if not parsed.input:
        return
    # Runner does all of its work inside its constructor.
    Runner(parsed)
# Script entry point: forward the CLI tokens (without the program name) to
# main() and exit with whatever it returns.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))