CNIRevelator/src/mrz.py

599 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf8 -*-
"""
********************************************************************************
* CNIRevelator *
* *
* Desc: MRZ data dictionary for CNIRevelator analyzer and *
* functions to analyze these data *
* *
* Copyright © 2018-2019 Adrien Bourmault (neox95) *
* *
* This file is part of CNIRevelator. *
* *
* CNIRevelator is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* any later version. *
* *
* CNIRevelator is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with CNIRevelator. If not, see <https://www.gnu.org/licenses/>. *
********************************************************************************
"""
import re
import datetime
import logger # logger.py
import globs # globs.py
import lang # lang.py
import critical # critical.py
## SEX CODES
# Maps the one-letter sex code to its (French) display label
sexcode = {'M':'Homme', 'F':'Femme', 'X':'Non spécifié'}
## COUNTRY CODES
# Country-name tables for the language selected in globs.CNIRlang;
# getDocInfos() looks up 3-character codes in landcode3 and any other
# length in landcode2.
landcode2 = lang.all[globs.CNIRlang]["LANDCODE2"]
landcode3 = lang.all[globs.CNIRlang]["LANDCODE3"]
## DOCUMENTS TYPES
# Machine readable passport: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL/CTRLF
# check digits, the ids of the fields the digit is computed over.
# CTRLF is handled like CTRL except that a '<' in its place is flagged as
# "facult" by computeAllControlSum().
P = [
    ["11222333333333333333333333333333333333333333", "444444444566677777789AAAAAABCCCCCCCCCCCCCCDE"],
    {
        "1": ["2", "CODE", "P."],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["39", "NOM", "([A-Z]|<)+"],
        "4": ["9", "NO", ".+"],
        "5": ["1", "CTRL", "[0-9]", "4"],
        "6": ["3", "NAT", "[A-Z]+"],
        "7": ["6", "BDATE", "[0-9]+"],
        "8": ["1", "CTRL", "[0-9]", "7"],
        "9": ["1", "SEX", "[A-Z]"],
        "A": ["6", "EDATE", "[0-9]+"],
        "B": ["1", "CTRL", "[0-9]", "A"],
        "C": ["14", "FACULT", ".+"],
        "D": ["1", "CTRLF", "[0-9]", "C"],
        "E": ["1", "CTRL", "[0-9]", "4578ABCD"]
    },
    lang.all[globs.CNIRlang]["Passeport lisible à la machine"]
]
# Passport card: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
IP = [
    ["112223333333334555555555555555", "66666678999999ABBBCCCCCCCCCCCD"],
    {
        "1": ["2", "CODE", "IP"],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["9", "NO", ".+"],
        "4": ["1", "CTRL", "[0-9]", "3"],
        "5": ["15", "FACULT", ".+"],
        "6": ["6", "BDATE", "[0-9]+"],
        "7": ["1", "CTRL", "[0-9]", "6"],
        "8": ["1", "SEX", "[A-Z]"],
        "9": ["6", "EDATE", "[0-9]+"],
        "A": ["1", "CTRL", "[0-9]", "9"],
        "B": ["3", "NAT", "[A-Z]+"],
        "C": ["11", "FACULT", ".+"],
        "D": ["1", "CTRL", "[0-9]", "345679AC"]
    },
    lang.all[globs.CNIRlang]["Carte-passeport"]
]
# European identity card: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
IDEUR = [
    ["112223333333334555555555555555", "66666678999999ABBBCCCCCCCCCCCD"],
    {
        "1": ["2", "CODE", "I."],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["9", "NO", ".+"],
        "4": ["1", "CTRL", "[0-9]", "3"],
        "5": ["15", "FACULT", ".+"],
        "6": ["6", "BDATE", "[0-9]+"],
        "7": ["1", "CTRL", "[0-9]", "6"],
        "8": ["1", "SEX", "[A-Z]"],
        "9": ["6", "EDATE", "[0-9]+"],
        "A": ["1", "CTRL", "[0-9]", "9"],
        "B": ["3", "NAT", "[A-Z]+"],
        "C": ["11", "FACULT", ".+"],
        "D": ["1", "CTRL", "[0-9]", "345679AC"]
    },
    lang.all[globs.CNIRlang]["Carte didentité européenne"]
]
# European residence permit: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
TSEUR = [
    ["112223333333334555555555555555", "66666678999999ABBBCCCCCCCCCCCD"],
    {
        "1": ["2", "CODE", "IR"],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["9", "NO", ".+"],
        "4": ["1", "CTRL", "[0-9]", "3"],
        "5": ["15", "FACULT", ".+"],
        "6": ["6", "BDATE", "[0-9]+"],
        "7": ["1", "CTRL", "[0-9]", "6"],
        "8": ["1", "SEX", "[A-Z]"],
        "9": ["6", "EDATE", "[0-9]+"],
        "A": ["1", "CTRL", "[0-9]", "9"],
        "B": ["3", "NAT", "[A-Z]+"],
        "C": ["11", "FACULT", ".+"],
        "D": ["1", "CTRL", "[0-9]", "345679AC"]
    },
    lang.all[globs.CNIRlang]["Carte de séjour européenne"]
]
# Crew member certificate: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
AC = [
    ["112223333333334EEE555555555555", "66666678999999ABBBCCCCCCCCCCCD"],
    {
        "1": ["2", "CODE", "AC"],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["9", "NO", ".+"],
        "4": ["1", "CTRL", "[0-9]", "3"],
        "E": ["3", "INDIC", "[A-Z]{1,2}."],
        "5": ["12", "FACULT", ".+"],
        # The pattern must not end with a space: an MRZ field never contains
        # one, so a trailing blank would make the birth date unmatchable.
        "6": ["6", "BDATE", "[0-9]+"],
        "7": ["1", "CTRL", "[0-9]", "6"],
        "8": ["1", "SEX", "[A-Z]"],
        "9": ["6", "EDATE", "[0-9]+"],
        "A": ["1", "CTRL", "[0-9]", "9"],
        "B": ["3", "NAT", "[A-Z]+"],
        "C": ["11", "FACULT", ".+"],
        "D": ["1", "CTRL", "[0-9]", "345679AC"]
    },
    lang.all[globs.CNIRlang]["Certificat de membre d'équipage"]
]
# Type A visa: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
VA = [
    ["11222333333333333333333333333333333333333333", "444444444566677777789AAAAAABCCCCCCCCCCCCCCCC"],
    {
        "1": ["2", "CODE", "V."],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["39", "NOM", "([A-Z]|<)+"],
        "4": ["9", "NO", ".+"],
        "5": ["1", "CTRL", "[0-9]","4"],
        "6": ["3", "NAT", "[A-Z]+"],
        "7": ["6", "BDATE", "[0-9]+"],
        "8": ["1", "CTRL", "[0-9]", "7"],
        "9": ["1", "SEX", "[A-Z]"],
        "A": ["6", "EDATE", "[0-9]+"],
        "B": ["1", "CTRL", "[0-9]", "A"],
        "C": ["16", "FACULT", ".+"]
    },
    lang.all[globs.CNIRlang]["Visa de type A"]
]
# Type B visa: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
VB = [
    ["112223333333333333333333333333333333", "444444444566677777789AAAAAABCCCCCCCC"],
    {
        "1": ["2", "CODE", "V."],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["31", "NOM", "([A-Z]|<)+"],
        "4": ["9", "NO", ".+"],
        "5": ["1", "CTRL", "[0-9]","4"],
        "6": ["3", "NAT", "[A-Z]+"],
        "7": ["6", "BDATE", "[0-9]+"],
        "8": ["1", "CTRL", "[0-9]", "7"],
        "9": ["1", "SEX", "[A-Z]"],
        "A": ["6", "EDATE", "[0-9]+"],
        "B": ["1", "CTRL", "[0-9]", "A"],
        "C": ["8", "FACULT", ".+"]
    },
    lang.all[globs.CNIRlang]["Visa de type B"]
]
# French residence permit: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
TSF = [
    # The second line carries 8 'C' positions so that it agrees with the
    # declared FACULT length ("8") and with the 36-character first line.
    ["112223333333333333333333333333333333", "444444444566677777789AAAAAABCCCCCCCC"],
    {
        "1": ["2", "CODE", "TS"],
        "2": ["3", "PAYS", "FRA"],
        "3": ["31", "NOM", "([A-Z]|<)+"],
        "4": ["9", "NO", ".+"],
        "5": ["1", "CTRL", "[0-9]", "4"],
        "6": ["3", "NAT", "[A-Z]+"],
        "7": ["6", "BDATE", "[0-9]+"],
        "8": ["1", "CTRL", "[0-9]", "7"],
        "9": ["1", "SEX", "[A-Z]"],
        "A": ["6", "EDATE", "[0-9]+"],
        "B": ["1", "CTRL", "[0-9]", "A"],
        "C": ["8", "FACULT", ".+"]
    },
    lang.all[globs.CNIRlang]["Carte de séjour FR"]
]
# Identity/travel document: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
TDV = [
    ["112223333333333333333333333333333333", "444444444566677777789AAAAAABCCCCCCCD"],
    {
        "1": ["2", "CODE", "I."],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["31", "NOM", "([A-Z]|<)+"],
        "4": ["9", "NO", ".+"],
        "5": ["1", "CTRL", "[0-9]", "4"],
        "6": ["3", "NAT", "[A-Z]+"],
        "7": ["6", "BDATE", "[0-9]+"],
        "8": ["1", "CTRL", "[0-9]", "7"],
        "9": ["1", "SEX", "[A-Z]"],
        "A": ["6", "EDATE", "[0-9]+"],
        "B": ["1", "CTRL", "[0-9]", "A"],
        "C": ["7", "FACULT", ".+"],
        "D": ["1", "CTRL", "[0-9]", "4578ABC"]
    },
    lang.all[globs.CNIRlang]["Titre d'identité/de voyage"]
]
# French identity document: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
# NOINT, NOINT2 and NOINT3 are concatenated into a single "NO" entry by
# getDocInfos().
IDFR = [
    ["112223333333333333333333333333444444", "555566677777899999999999999AAAAAABCD"],
    {
        "1": ["2", "CODE", "ID"],
        "2": ["3", "PAYS", "FRA"],
        "3": ["25", "NOM", "([A-Z]|<)+"],
        "4": ["6", "NOINT", ".+"],
        "5": ["4", "DDATE", "[0-9]+"],
        "6": ["3", "NOINT2", "[0-9]+"],
        "7": ["5", "NOINT3", "[0-9]+"],
        "8": ["1", "CTRL", "[0-9]", "567"],
        "9": ["14", "PRENOM", "[A-Z]"],
        "A": ["6", "BDATE", "[0-9]+"],
        "B": ["1", "CTRL", "[0-9]", "A"],
        "C": ["1", "SEX", "[A-Z]"],
        # NOTE(review): id 'E' does not occur in this layout, so listing it
        # here has no effect on the computed sum.
        "D": ["1", "CTRL", "[0-9]", "123456789ABCE"]
    },
    lang.all[globs.CNIRlang]["Pièce d'identité FR"]
]
# Driving licence: [layout lines, field specs, localized label].
# Every character of a layout line is the id of the field found at that
# position; a field spec is [length, info type, regex] plus, for CTRL
# check digits, the ids of the fields the digit is computed over.
# This layout has a single line; the second layout line is left empty.
DL = [
    ["112223333333334555555666666667", ""],
    {
        "1": ["2", "CODE", "D1"],
        "2": ["3", "PAYS", "[A-Z]+"],
        "3": ["9", "NO", "[0-9]{2}[A-Z]{2}[0-9]{5}"],
        "4": ["1", "CTRL", "[0-9]", "123"],
        "5": ["6", "EDATE", "[0-9]+"],
        "6": ["8", "NOM", "([A-Z]|<)+"],
        "7": ["1", "CTRL", "[0-9]", "123456"]
    },
    lang.all[globs.CNIRlang]["Permis de conduire"]
]
# Every known document layout. The order is significant: allDocMatch()
# scores them in this order and resolves equal scores by position.
TYPES = [IDFR, TDV, VB, VA, AC, IDEUR, IP, P, DL, TSF, TSEUR]
# Length of the longest first layout line among all document types
longest = max(len(layout[0][0]) for layout in TYPES)
## THE ROOT OF THIS PROJECT !
def getDocString(doc):
    """
    Return the whole field layout of a document as one string: the layout of
    the first line immediately followed by the layout of the second line.
    """
    firstLine, secondLine = doc[0][0], doc[0][1]
    return firstLine + secondLine
def getFieldLimits(doc, fieldtype):
    """
    Locate a field id inside the layout of a document.

    Returns a couple (line, span): line is 0 when the id is present on the
    first layout line and 1 otherwise; span is the (start, end) couple given
    by limits() for that line.
    """
    onFirstLine = limits(doc[0][0], fieldtype)
    onSecondLine = limits(doc[0][1], fieldtype)
    # limits() reports a start of -1 when the id is absent from the line
    if -1 not in onFirstLine:
        return 0, onFirstLine
    return 1, onSecondLine
def limits(line, fieldtype):
    """
    Return the (start, end) span of a field id inside a layout line.

    start is the index of the first occurrence of the id and end is one past
    the index of its last occurrence, so line[start:end] is the whole field.
    An id that is absent from the line gives (-1, 0).
    """
    return (line.find(fieldtype), 1 + line.rfind(fieldtype))
def completeDocField(doc, code, position):
    """
    Return the number of characters between the given position and the end
    of the field that position belongs to, i.e. how many characters are
    needed to fill the field up to its end.

    The code argument is accepted but not used.
    """
    layout = getDocString(doc)
    fieldEnd = limits(layout, layout[position])[1]
    return fieldEnd - position
def docMatch(doc, strs):
    """
    Score how well a couple of MRZ lines fits a document layout.

    Each field of the layout is matched, with its regex, against the slice of
    the corresponding input line. Returns a tuple (level, nchar, bonus):
    level is the total number of characters matched by the field regexes,
    nchar is the sum of the declared lengths of all fields, and bonus is 100
    when the document code (field "1") matches.
    """
    fieldSpecs = doc[1]
    level = nchar = bonus = 0
    for lineIndex in (0, 1):
        pattern = doc[0][lineIndex]
        cursor = 0
        while cursor < len(pattern):
            # Field found under the cursor, and its span on this line
            fieldtype = pattern[cursor]
            start, stop = limits(pattern, fieldtype)
            # Next iteration resumes right after this field
            cursor = stop
            spec = fieldSpecs[fieldtype]
            # Match the field regex at the start of the input slice
            found = re.match(spec[2], strs[lineIndex][start:stop])
            if found:
                level += found.end()
                if fieldtype == "1":
                    bonus += 100
            nchar += int(spec[0])
    logger.logCur.printdbg("{} level : {}/{} (+{})".format(doc[2], level, nchar, bonus))
    return (level, nchar, bonus)
def allDocMatch(strs, final=False):
    """
    Score every document type of TYPES against the given MRZ lines and
    return the list of the most plausible document types.

    strs is the couple of MRZ lines. When final is true, a document whose
    declared total length equals the length of the input gets 100 extra
    points.

    All documents sharing the best score are returned. When a single
    document holds the best score, the best of the remaining documents is
    added as well if its score is at most 20 points lower.
    """
    # Global handler
    logfile = logger.logCur

    SCORES = []
    for doc in TYPES:
        level, nchar, bonus = docMatch(doc, strs)
        # Number of compatible characters + bonus for the document code
        score = level + bonus
        # Exact length match only counts once the input is complete
        if final and len(strs[0] + strs[1]) == nchar:
            score += 100
        SCORES.append(score)

    best = max(SCORES)
    candidate = SCORES.index(best)
    candidates = [doc for doc, score in zip(TYPES, SCORES) if score == best]

    if len(candidates) < 2:
        # Look for the runner-up by index, without mutating SCORES, so that a
        # document placed right after the winner is not skipped.
        runnerUp = None
        for i, score in enumerate(SCORES):
            if i == candidate:
                continue
            # strict comparison keeps the first of several equal runner-ups
            if runnerUp is None or score > SCORES[runnerUp]:
                runnerUp = i
        if runnerUp is not None and SCORES[runnerUp] >= best - 20:
            candidates.append(TYPES[runnerUp])

    canditxt = [doc[2] for doc in candidates]
    logfile.printdbg("Scores : {}".format(SCORES))
    logfile.printdbg("Candidates : {}".format(canditxt))
    return candidates
def computeControlSum(code):
    """
    Compute the check digit of a string of MRZ characters.

    Characters are weighted 7, 3, 1 cyclically by position. '<' and newline
    count as 0, digits as their value and capital letters as 10 ('A') to
    35 ('Z'). The first character of any other kind stops the computation.
    Returns the weighted sum modulo 10.
    """
    weights = (7, 3, 1)
    total = 0
    for index, char in enumerate(code):
        if char in ('<', '\n'):
            value = 0
        elif char in '0123456789':
            value = int(char)
        elif char in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            value = ord(char) - 55
        else:
            # unsupported character: ignore it and everything after it
            break
        total += value * weights[index % 3]
    return total % 10
def computeAllControlSum(doc, code):
    """
    Compute every check digit of an MRZ string for a given document layout.

    doc is a document descriptor and code the MRZ text laid out like
    getDocString(doc). Positions of code beyond the layout, or of the layout
    beyond code, are ignored.

    Returns a dict {"ctrlSumList": [...]} holding one tuple
    (field id, position, computed digit, facult) per CTRL/CTRLF position met
    in code, in position order. facult becomes True as soon as a CTRLF
    position holds '<' and stays True for the entries that follow.
    """
    layout = getDocString(doc)
    fieldSpecs = doc[1]
    ctrlSumList = []
    facult = False
    # zip() stops at the shorter of layout and code, which bounds every
    # index for both kinds of check digit.
    for charPos, (field, char) in enumerate(zip(layout, code)):
        spec = fieldSpecs[field]
        kind = spec[1]
        if kind not in ("CTRL", "CTRLF"):
            continue
        covered = spec[3]
        # Gather, in order, the characters of all the fields this digit covers
        codeChain = "".join(c for target, c in zip(layout, code) if target in covered)
        ctrlSum = computeControlSum(codeChain)
        if kind == "CTRLF" and char == "<":
            facult = True
        ctrlSumList.append((field, charPos, ctrlSum, facult))
    return {
        "ctrlSumList" : ctrlSumList
    }
def _appendInfo(res, key, value):
    """
    Append value to the text stored in res[key], creating the entry when it
    is missing or when it still holds the numeric [0, 0] placeholder.
    """
    entry = res.get(key)
    if entry is not None and isinstance(entry[0], str):
        entry[0] += value
        entry[1] = True
    else:
        res[key] = [value, True]


def getDocInfos(doc, code):
    """
    Extract the information carried by an MRZ string for a document layout.

    doc is a document descriptor and code the MRZ text laid out like the
    concatenation of the two layout lines of doc.

    Returns a dict mapping an info type to a couple (value, valid):
      - "LEN": length of code, valid when it equals the layout length;
      - "PAYS"/"NAT": country name from landcode3 (3-character codes) or
        landcode2 (other lengths), or the raw code with valid False when
        it is unknown;
      - "*DATE": date as DD/MM/YY, valid when it is a real date;
      - "NO": concatenation of the NOINT, NOINT2 and NOINT3 fields;
      - "INDIC": concatenation of the FACULT fields (appended to the INDIC
        field when there is one);
      - "SEX": valid when the value is found in "MF";
      - any other type: the field text when it is not empty.
    An info type for which nothing was stored keeps the placeholder [0, 0].
    """
    layout = doc[0][0] + doc[0][1]
    # (info type, span in the layout) of every field of the document
    infoTypes = [(spec[1], limits(layout, field)) for field, spec in doc[1].items()]

    length = len(code)
    res = {"LEN": [length, length == len(layout)]}

    for infoType, (start, end) in infoTypes:
        # Filler characters are turned into blanks and trimmed
        value = code[start:end].replace("<", " ").strip()
        res[infoType] = [0, 0]

        # State code
        if infoType == 'PAYS' or infoType == 'NAT':
            # Fillers are already removed, so the length alone selects the table
            table = landcode3 if len(value) == 3 else landcode2
            try:
                res[infoType] = (table[value], True)
            except KeyError:
                res[infoType] = [value, False]

        # Dates
        elif infoType[1:] == 'DATE':
            # size adaptation: YYMMDD, or YYMM with the day forced to 01
            if len(value) == 6:
                value = "{}/{}/{}".format(value[4:6], value[2:4], value[0:2])
            elif len(value) == 4:
                value = "{}/{}/{}".format("01", value[2:4], value[0:2])
            # date validation
            try:
                datetime.datetime.strptime(value, "%d/%m/%y")
            except ValueError:
                if value != "":
                    res[infoType] = [value, False]
            else:
                res[infoType] = [value, True]

        # Numbers split over several fields
        elif infoType == 'NOINT' or infoType[:-1] == 'NOINT':
            _appendInfo(res, "NO", value)

        elif infoType == 'FACULT':
            # Safe even when an empty INDIC field left its placeholder behind
            _appendInfo(res, "INDIC", value)

        # Sex
        elif infoType == 'SEX':
            res[infoType] = [value, value in "MF"]

        # All other cases
        else:
            if value != "":
                res[infoType] = [value, True]
    return res