# Source code for sotastream.utils.phrases
import random
[docs]
class PhraseSpanExtractor:
    """Re-implementation of phrase span extraction algorithm from Moses.

    Given per-token spans for a source and a target sentence plus a word
    alignment between them, enumerate every phrase pair that is consistent
    with the alignment and expose the pairs as ``(sourceSpan, targetSpan)``
    tuples in ``self.phrases``.

    Typical use::

        extractor = PhraseSpanExtractor(srcSpans, trgSpans, alignment)
        extractor.computePhraseSpans()
        pairs = extractor.samplePhraseSpans(k=3)
    """

    def __init__(self, srcSpans, trgSpans, alignment, maxLength=7):
        """Store the inputs and precompute the set of aligned target positions.

        Args:
            srcSpans: Sequence with one entry per source token. Each entry is
                indexed with ``[0]`` (start) and ``[1]`` (end); presumably
                character offsets into the source string -- confirm with the
                caller.
            trgSpans: Same as ``srcSpans`` for the target side.
            alignment: Re-iterable collection of ``(p, q)`` pairs linking
                source token index ``p`` to target token index ``q``. It is
                iterated many times, so a one-shot iterator will not work.
            maxLength: Maximum number of tokens allowed on either side of an
                extracted phrase pair.
        """
        self.srcSpans = srcSpans
        self.trgSpans = trgSpans
        self.alignment = alignment
        self.maxLength = maxLength
        self.srcLength = len(srcSpans)
        self.trgLength = len(trgSpans)
        # Filled by computePhraseSpans(); entries are ((srcBegin, srcEnd), (trgBegin, trgEnd)).
        self.phrases = []
        # Target token indices that have at least one alignment link.
        self.marked = {q for _, q in alignment}

    def extract(self, srcStart, srcEnd, trgStart, trgEnd):
        """Return the token-index phrase pairs for one source range.

        Args:
            srcStart: First source token index of the phrase (inclusive).
            srcEnd: Last source token index of the phrase (inclusive).
            trgStart: Smallest target index aligned to the source range.
            trgEnd: Largest target index aligned to the source range, or
                ``-1`` when no token in the source range is aligned.

        Returns:
            A list of ``((srcStart, srcEnd), (ts, te))`` tuples with inclusive
            token indices. The list is empty when the source range has no
            aligned target token, or when a target token inside
            ``[trgStart, trgEnd]`` is linked to a source token outside
            ``[srcStart, srcEnd]``.
        """
        if trgEnd == -1:
            return []

        # Consistency check: no link may leave the source range from inside
        # the minimal target range.
        inconsistent = any(
            trgStart <= q <= trgEnd and (p < srcStart or p > srcEnd)
            for p, q in self.alignment
        )
        if inconsistent:
            return []

        pairs = []
        ts = trgStart
        while True:
            te = trgEnd
            # Grow the right edge over unaligned target tokens, keeping the
            # target side at most maxLength tokens long.
            while te - ts < self.maxLength:
                pairs.append(((srcStart, srcEnd), (ts, te)))
                te += 1
                if te in self.marked or te >= self.trgLength:
                    break
            # Grow the left edge over unaligned target tokens.
            ts -= 1
            if ts in self.marked or ts < 0:
                break
        return pairs

    def computePhraseSpans(self):
        """Enumerate all consistent phrase pairs and append them to ``self.phrases``.

        Every source range of at most ``maxLength`` tokens is paired with the
        target ranges produced by :meth:`extract`. Token indices are converted
        to spans by taking element ``[0]`` of the first token's span and
        element ``[1]`` of the last token's span.

        The method returns ``None`` and appends to ``self.phrases``; calling
        it more than once therefore adds the same pairs again.
        """
        for srcStart in range(self.srcLength):
            for srcEnd in range(srcStart, self.srcLength):
                if srcEnd - srcStart >= self.maxLength:
                    break

                # Minimal target range covering every link of the source range;
                # trgEnd stays at -1 when the range has no links at all.
                trgStart = self.trgLength - 1
                trgEnd = -1
                for p, q in self.alignment:
                    if srcStart <= p <= srcEnd:
                        trgStart = min(q, trgStart)
                        trgEnd = max(q, trgEnd)

                for (sb, se), (tb, te) in self.extract(srcStart, srcEnd, trgStart, trgEnd):
                    self.phrases.append(
                        (
                            (self.srcSpans[sb][0], self.srcSpans[se][1]),
                            (self.trgSpans[tb][0], self.trgSpans[te][1]),
                        )
                    )

    def samplePhraseSpans(self, k=1):
        """Draw up to ``k`` phrase pairs at random, favouring short ones.

        Args:
            k: Requested number of samples; capped at ``len(self.phrases)``.

        Returns:
            A list of entries from ``self.phrases``. Sampling uses
            ``random.choices`` and is therefore with replacement, so the same
            pair can appear more than once. Each pair is weighted by
            ``2 / (sourceWidth + targetWidth + 2)`` where a width is
            ``span[1] - span[0]``. Returns ``[]`` when there is nothing to
            sample or the capped ``k`` is ``0``.
        """
        k = min(k, len(self.phrases))
        if not k:
            return []
        weights = [2 / (s[1] - s[0] + t[1] - t[0] + 2) for s, t in self.phrases]
        return random.choices(self.phrases, weights=weights, k=k)