nfcorpus.py
import io
import codecs
import re
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import Cache, TarExtract, IterStream, GzipExtract, Lazy, DownloadConfig
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredScoredDocs, FilteredQrels, FilteredDocPairs, YamlDocumentation
from ir_datasets.formats import TsvQueries, TsvDocs, TrecQrels, TrecScoredDocs, TsvDocPairs, BaseQueries
NAME = 'nfcorpus'
_logger = ir_datasets.log.easy()
QRELS_DEFS = {
    2: "A direct link from the query to the document in the cited sources section of a page.",
    1: "A link exists from the query to another query that directly links to the document.",
    0: "Marginally relevant, based on topic containment.",
}


class NfCorpusDoc(NamedTuple):
    doc_id: str
    url: str
    title: str
    abstract: str
    def default_text(self):
        """
        title and abstract
        """
        return f'{self.title} {self.abstract}'


class NfCorpusQuery(NamedTuple):
    query_id: str
    title: str
    all: str
    def default_text(self):
        """
        title
        """
        return self.title


class NfCorpusVideoQuery(NamedTuple):
    query_id: str
    title: str
    desc: str
    def default_text(self):
        """
        title
        """
        return self.title


class ZipQueries(BaseQueries):
    """
    Combines several parallel query sources (aligned by position and query_id)
    into a single query type. Each entry of idxs is a (source index, field index)
    pair selecting one field of the output query tuple.
    """
    def __init__(self, queries, idxs, qtype):
        self._queries = queries
        self._idxs = idxs
        self._qtype = qtype

    def queries_iter(self):
        for qs in zip(*(q.queries_iter() for q in self._queries)):
            assert len({q.query_id for q in qs}) == 1 # all query IDs should be the same
            yield self._qtype(*(qs[i][j] for i, j in self._idxs))

    def queries_cls(self):
        return self._qtype

    def queries_path(self):
        return self._queries[0].queries_path()

    def queries_namespace(self):
        return NAME

    def queries_lang(self):
        return self._queries[0].queries_lang()


def _init():
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    main_dlc = dlc['main']
    collection = TsvDocs(Cache(TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'), base_path/'collection.tsv'), doc_cls=NfCorpusDoc, namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    subsets = {}

    def read_lines(file):
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'), base_path/file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))

    subsets['train'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.titles.queries'), base_path/'train/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.all.queries'), base_path/'train/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/train.3-2-1.qrel'), base_path/'train/qrels'), QRELS_DEFS),
        documentation('train'),
    )

    subsets['train/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'), base_path/'train/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        FilteredQrels(subsets['train'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('train/nontopic'),
    )

    subsets['train/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-titles.queries'), base_path/'train/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/train.vid-desc.queries'), base_path/'train/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['train'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('train/video'),
    )

    subsets['dev'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.titles.queries'), base_path/'dev/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.all.queries'), base_path/'dev/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/dev.3-2-1.qrel'), base_path/'dev/qrels'), QRELS_DEFS),
        documentation('dev'),
    )

    subsets['dev/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'), base_path/'dev/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        FilteredQrels(subsets['dev'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('dev/nontopic'),
    )

    subsets['dev/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-titles.queries'), base_path/'dev/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.vid-desc.queries'), base_path/'dev/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['dev'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('dev/video'),
    )

    subsets['test'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.titles.queries'), base_path/'test/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.all.queries'), base_path/'test/queries.all.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(Cache(TarExtract(main_dlc, 'nfcorpus/test.3-2-1.qrel'), base_path/'test/qrels'), QRELS_DEFS),
        documentation('test'),
    )

    subsets['test/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'), base_path/'test/nontopic/queries.tsv'), namespace=NAME, lang='en'),
        FilteredQrels(subsets['test'].qrels_handler(), nontopic_qid_filter, mode='include'),
        documentation('test/nontopic'),
    )

    subsets['test/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-titles.queries'), base_path/'test/video/queries.titles.tsv'), namespace=NAME, lang='en'),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.vid-desc.queries'), base_path/'test/video/queries.desc.tsv'), namespace=NAME, lang='en'),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        FilteredQrels(subsets['test'].qrels_handler(), video_qid_filter, mode='include'),
        documentation('test/video'),
    )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets


collection, subsets = _init()
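

# A minimal usage sketch, assuming the ir_datasets package is installed so that
# the names registered in _init() above are available through the public API
# ('nfcorpus/train' is one of those keys; the other subsets follow the same pattern):
#
#     import ir_datasets
#
#     dataset = ir_datasets.load('nfcorpus/train')
#     for query in dataset.queries_iter():   # NfCorpusQuery(query_id, title, all)
#         ...
#     for doc in dataset.docs_iter():        # NfCorpusDoc(doc_id, url, title, abstract)
#         ...
#     for qrel in dataset.qrels_iter():      # graded relevance per QRELS_DEFS
#         ...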