adjust text similarity metrics

This commit is contained in:
Yi Ge 2019-04-27 03:18:27 +02:00
parent a3986b3279
commit 3f73cb9bca

View File

@@ -48,12 +48,12 @@ class PredictedFrame:
self.text = ' '.join(word.text for word in self.words) self.text = ' '.join(word.text for word in self.words)
# remove chars that are obviously ocr errors # remove chars that are obviously ocr errors
translate_table = {ord(c): None for c in '<>{};`@#$%^*_=\\'} translate_table = {ord(c): None for c in '<>{}[];`@#$%^*_=~\\'}
translate_table[ord('|')] = 'I' translate_table[ord('|')] = 'I'
self.text = self.text.translate(translate_table).strip() self.text = self.text.translate(translate_table).strip()
def is_similar_to(self, other: PredictedFrame, threshold=70) -> bool: def is_similar_to(self, other: PredictedFrame, threshold=70) -> bool:
return fuzz.partial_ratio(self.text, other.text) >= threshold return fuzz.ratio(self.text, other.text) >= threshold
class PredictedSubtitle: class PredictedSubtitle:
@@ -81,7 +81,7 @@ class PredictedSubtitle:
return self.frames[-1].index return self.frames[-1].index
return 0 return 0
def is_similar_to(self, other: PredictedSubtitle, threshold=70) -> bool: def is_similar_to(self, other: PredictedSubtitle, threshold=90) -> bool:
return fuzz.partial_ratio(self.text, other.text) >= threshold return fuzz.partial_ratio(self.text, other.text) >= threshold
def __repr__(self): def __repr__(self):