% % jp-encoding.mzn % % % There are several popular character encodings in Japan. % (EUC-JP, SJIS, UTF-8) % % If they are mixed into one text file (by accident or something), % text information will be lost. % So we can define a problem "recovering original encodings" for byte stream. % % In this problem, byte stream (as variable 'stream') is given as input % and encodings are assigned as output (as variable 'encoding'). % include "table.mzn"; % *_score tables have -log(probability of appearance) * 10 for each encoding. array[1..256] of int: sjis_score = [ 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 57, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 74, 135, 128, 119, 135, 100, 135, 135, 96, 96, 112, 104, 106, 92, 106, 107, 97, 92, 99, 95, 98, 106, 107, 103, 97, 101, 105, 135, 135, 111, 135, 119, 65, 46, 45, 64, 75, 74, 79, 78, 71, 74, 79, 76, 71, 79, 59, 64, 81, 78, 70, 67, 80, 81, 70, 82, 69, 81, 75, 72, 62, 60, 73, 70, 76, 79, 75, 66, 77, 75, 78, 73, 77, 43, 43, 79, 54, 68, 68, 61, 73, 67, 80, 66, 69, 53, 51, 72, 73, 68, 74, 71, 77, 80, 77, 135, 77, 28, 11, 51, 75, 73, 69, 69, 51, 47, 49, 49, 44, 49, 41, 47, 44, 48, 48, 47, 50, 46, 45, 48, 63, 76, 75, 79, 74, 68, 63, 73, 50, 67, 40, 80, 45, 79, 56, 71, 59, 42, 45, 55, 67, 49, 68, 58, 69, 53, 65, 54, 69, 45, 57, 50, 64, 60, 65, 54, 67, 44, 49, 58, 66, 47, 58, 74, 43, 47, 43, 59, 43, 45, 62, 62, 40, 45, 60, 72, 71, 69, 72, 68, 61, 80, 62, 68, 72, 68, 70, 75, 51, 66, 67, 61, 47, 56, 59, 61, 62, 53, 53, 47, 51, 43, 49, 60, 74, 60, 83, 74, 47, 47, 76, 71, 78, 82, 77, 83, 80, 81, 70, 67, 67, 135, 135, 135 ]; array[1..256] of int: eucjp_score = [ 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 57, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 74, 135, 128, 119, 135, 100, 135, 135, 96, 96, 112, 104, 106, 92, 106, 107, 97, 92, 99, 95, 98, 106, 107, 103, 97, 101, 105, 135, 135, 111, 135, 119, 135, 112, 114, 100, 108, 102, 135, 113, 121, 110, 124, 135, 111, 106, 111, 111, 100, 135, 114, 109, 112, 124, 135, 135, 124, 119, 128, 128, 121, 128, 135, 135, 135, 94, 128, 102, 104, 95, 113, 97, 108, 90, 124, 124, 99, 101, 95, 90, 105, 135, 93, 99, 94, 108, 128, 113, 135, 124, 117, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 28, 41, 44, 11, 50, 45, 72, 55, 64, 57, 42, 45, 54, 65, 46, 49, 49, 51, 48, 50, 48, 54, 41, 45, 45, 52, 44, 48, 45, 49, 39, 43, 50, 51, 43, 49, 53, 40, 43, 41, 50, 34, 36, 48, 45, 38, 43, 53, 66, 62, 66, 61, 62, 49, 50, 59, 62, 62, 63, 63, 68, 50, 63, 63, 58, 46, 55, 57, 59, 58, 52, 50, 46, 49, 43, 48, 58, 63, 58, 73, 67, 46, 46, 70, 68, 58, 66, 71, 73, 72, 74, 66, 61, 58, 135 ]; array[1..256] of int: utf8_score = [ 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 61, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 78, 139, 132, 123, 139, 104, 139, 139, 100, 100, 116, 108, 110, 96, 110, 111, 101, 96, 103, 99, 102, 110, 111, 107, 101, 105, 109, 139, 139, 115, 139, 139, 139, 116, 118, 104, 112, 107, 139, 117, 125, 114, 128, 139, 115, 110, 115, 115, 104, 139, 118, 113, 116, 128, 139, 139, 128, 123, 132, 132, 125, 132, 139, 139, 139, 98, 132, 107, 108, 99, 117, 101, 112, 95, 128, 128, 103, 105, 99, 94, 109, 139, 97, 103, 98, 112, 132, 117, 139, 128, 121, 139, 139, 139, 139, 139, 36, 17, 27, 51, 42, 55, 47, 54, 41, 40, 49, 38, 42, 47, 64, 48, 53, 52, 49, 45, 56, 51, 56, 46, 58, 50, 58, 52, 57, 51, 65, 46, 51, 53, 63, 48, 54, 54, 44, 48, 45, 54, 45, 46, 61, 53, 42, 47, 53, 61, 63, 57, 61, 63, 62, 62, 48, 59, 45, 53, 39, 57, 49, 57, 139, 139, 139, 132, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 118, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 68, 14, 42, 36, 40, 44, 44, 47, 139, 139, 139, 139, 139, 40, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139 ]; % % labels for encoding % int: e_ascii = 0; int: e_euc_jp = 1; int: e_sjis = 2; int: e_utf8 = 3; int: e_unknown = 4; % % labels for byte_status % int: b_ascii = 0; int: b_euc1 = 1; int: b_euc2 = 2; int: b_sjis1_1 = 3; int: b_sjis2_1 = 4; int: b_sjis2_2 = 5; int: b_utf8_2_1 = 6; int: b_utf8_2_2 = 7; int: b_utf8_3_1 = 8; int: b_utf8_3_2 = 9; int: b_utf8_3_3 = 10; int: b_utf8_4_1 = 11; int: b_utf8_4_2 = 12; int: b_utf8_4_3 = 13; int: b_utf8_4_4 = 14; int: b_unknown = 15; % % test data % % int: len = 12; % array[1..len] of int: stream = [ 227, 129, 138, 227, 129, 175, 227, 130, 136, 227, 129, 134 ]; int: len; set of int: rng = 1..len; array[rng] of int: stream; % int: len = 3 ; % array[rng] of int: stream = [ 65, 66, 67 ]; %------------------------------------------------------------------------------% % Variables array[rng] of var 0..b_unknown: byte_status; array[rng] of var 0..e_unknown: encoding; array[rng] of var 0..1: char_start; set of int: scoreType = 0..max(i in 1..256)(eucjp_score[i] + sjis_score[i] + utf8_score[i]) + 1000; int: maxObj = len * (max(i in 1..256)(eucjp_score[i] + sjis_score[i] + utf8_score[i]) + 1000); var 0..maxObj: objective; constraint objective = sum (i in rng) ( score_of(encoding[i],stream[i]) ); predicate score_computation(var 0..e_unknown:encoding_i, var 0..255:stream_i, var scoreType: score_i) :: presolve(autotable(model)) = score_i = (bool2int(encoding_i == e_euc_jp) * eucjp_score[stream_i+1] + bool2int(encoding_i == e_sjis) * sjis_score[stream_i+1] + bool2int(encoding_i == e_utf8) * utf8_score[stream_i+1] + bool2int(encoding_i == e_unknown) * 1000); function var scoreType: score_of(var int: encoding_i, int: stream_i) = let{ var scoreType: score_i; constraint score_computation(encoding_i,stream_i, score_i); } in score_i; %------------------------------------------------------------------------------% % Constraints constraint forall (i in rng) ( if i >= 2 then link_statuses(byte_status[i],byte_status[i-1]) else true endif /\ link_vars(encoding[i],byte_status[i],char_start[i]) /\ init(byte_status[i],i) /\ stream_to_status(i,stream,byte_status[i]) ); predicate link_vars(var int: encoding_i, var int: byte_status_i, var int: char_start_i) :: presolve(autotable) = (forall([ % % byte_status -> encoding % encoding_i = e_ascii <-> byte_status_i = b_ascii , encoding_i = e_euc_jp -> byte_status_i = b_euc1 \/ byte_status_i = b_euc2 , (encoding_i = e_sjis -> byte_status_i = b_sjis1_1 \/ byte_status_i = b_sjis2_1 \/ byte_status_i = b_sjis2_2) , encoding_i = e_utf8 -> byte_status_i = b_utf8_2_1 \/ byte_status_i = b_utf8_2_2 \/ byte_status_i = b_utf8_3_1 \/ byte_status_i = b_utf8_3_2 \/ byte_status_i = b_utf8_3_3 \/ byte_status_i = b_utf8_4_1 \/ byte_status_i = b_utf8_4_2 \/ byte_status_i = b_utf8_4_3 \/ byte_status_i = b_utf8_4_4 , byte_status_i = b_unknown <-> encoding_i = e_unknown , % % byte_status -> char_start % byte_status_i = b_ascii -> char_start_i = 1 , byte_status_i = b_utf8_2_1 -> char_start_i = 1 , byte_status_i = b_utf8_2_2 -> char_start_i = 0 , byte_status_i = b_utf8_3_1 -> char_start_i = 1 , byte_status_i in {b_utf8_3_2, b_utf8_3_3} -> char_start_i = 0 , byte_status_i = b_utf8_4_1 -> char_start_i = 1 , byte_status_i in {b_utf8_4_2, b_utf8_4_3, b_utf8_4_4} -> char_start_i = 0 , byte_status_i = b_euc1 -> char_start_i = 1 , byte_status_i = b_euc2 -> char_start_i = 0 , byte_status_i = b_sjis1_1 -> char_start_i = 1 , byte_status_i = b_sjis2_1 -> char_start_i = 1 , byte_status_i = b_sjis2_2 -> char_start_i = 0 , % % unknown % byte_status_i = b_unknown -> char_start_i = 1 ])); predicate stream_to_status(int:i, array[int] of int: stream, var int: byte_status_i) = ( forall([ % % ASCII % if (stream[i] < 128) then true else byte_status_i != b_ascii endif , % % UTF-8 % % C2-DF, 80-BF if (i >= len) then byte_status_i != b_utf8_2_1 else if ((194 <= stream[i] /\ stream[i] <= 223) /\ (128 <= stream[i+1] /\ stream[i+1] <= 191)) then true else byte_status_i != b_utf8_2_1 endif endif , % E0-EF, 80-BF, 80-BF if (i >= len - 1) then byte_status_i != b_utf8_3_1 else if ((224 <= stream[i] /\ stream[i] <= 239) /\ (128 <= stream[i+1] /\ stream[i+1] <= 191) /\ (128 <= stream[i+2] /\ stream[i+2] <= 191)) then true else byte_status_i != b_utf8_3_1 endif endif , % F0-F7, 80-BF, 80-BF, 80-BF if (i >= len - 2) then byte_status_i != b_utf8_4_1 else if (240 <= stream[i] /\ stream[i] <= 247 /\ 128 <= stream[i+1] /\ stream[i+1] <= 191 /\ 128 <= stream[i+2] /\ stream[i+2] <= 191 /\ 128 <= stream[i+3] /\ stream[i+3] <= 191) then true else byte_status_i != b_utf8_4_1 endif endif , % % EUC-JP (CP51932) % (A1-A8, AD, B0-F4, F9-FC), (A1-FE) % if (i >= len) then byte_status_i !=b_euc1 else if ((161 <= stream[i] /\ stream[i] <= 168) \/ (173 = stream[i]) \/ (176 <= stream[i] /\ stream[i] <= 244) \/ (249 <= stream[i] /\ stream[i] <= 252)) /\ (161 <= stream[i+1] /\ stream[i+1] <= 254) then true else byte_status_i != b_euc1 endif endif , % % SJIS % % (A1-DF) if (161 <= stream[i] /\ stream[i] <= 223) then true else byte_status_i != b_sjis1_1 endif , % (81-9F, E0-FC), (40-7E, 80-FC) if (i >= len) then byte_status_i != b_sjis2_1 else if ((129 <= stream[i] /\ stream[i] <= 159) \/ (224 <= stream[i] /\ stream[i] <= 252)) /\ ((64 <= stream[i+1] /\ stream[i+1] <= 126) \/ (128 <= stream[i+1] /\ stream[i+1] <= 252)) then true else byte_status_i != b_sjis2_1 endif endif ]) ); predicate stream_to_status_old(int:i, array[int] of int: stream, var int: byte_status_i) = ( forall([ % % ASCII % if (stream[i] < 128) then true else byte_status_i != b_ascii endif , % % UTF-8 % % C2-DF, 80-BF if (i >= len) then byte_status_i != b_utf8_2_1 else byte_status_i = b_utf8_2_1 -> ((194 <= stream[i] /\ stream[i] <= 223) /\ (128 <= stream[i+1] /\ stream[i+1] <= 191)) endif , % E0-EF, 80-BF, 80-BF if (i >= len - 1) then byte_status_i != b_utf8_3_1 else byte_status_i = b_utf8_3_1 -> ((224 <= stream[i] /\ stream[i] <= 239) /\ (128 <= stream[i+1] /\ stream[i+1] <= 191) /\ (128 <= stream[i+2] /\ stream[i+2] <= 191)) endif , % F0-F7, 80-BF, 80-BF, 80-BF if (i >= len - 2) then byte_status_i != b_utf8_4_1 else byte_status_i = b_utf8_4_1 -> (240 <= stream[i] /\ stream[i] <= 247 /\ 128 <= stream[i+1] /\ stream[i+1] <= 191 /\ 128 <= stream[i+2] /\ stream[i+2] <= 191 /\ 128 <= stream[i+3] /\ stream[i+3] <= 191) endif , % % EUC-JP (CP51932) % (A1-A8, AD, B0-F4, F9-FC), (A1-FE) % if (i >= len) then byte_status_i !=b_euc1 else byte_status_i = b_euc1 -> ((161 <= stream[i] /\ stream[i] <= 168) \/ (173 = stream[i]) \/ (176 <= stream[i] /\ stream[i] <= 244) \/ (249 <= stream[i] /\ stream[i] <= 252)) /\ (161 <= stream[i+1] /\ stream[i+1] <= 254) endif , % % SJIS % % (A1-DF) byte_status_i = b_sjis1_1 -> (161 <= stream[i] /\ stream[i] <= 223) , % (81-9F, E0-FC), (40-7E, 80-FC) if (i >= len) then byte_status_i != b_sjis2_1 else byte_status_i = b_sjis2_1 -> ((129 <= stream[i] /\ stream[i] <= 159) \/ (224 <= stream[i] /\ stream[i] <= 252)) /\ ((64 <= stream[i+1] /\ stream[i+1] <= 126) \/ (128 <= stream[i+1] /\ stream[i+1] <= 252)) endif ]) ); predicate link_statuses(var int: byte_status_i, var int: byte_status_im1) :: presolve(autotable) = ( forall([ % % UTF-8 % % C2-DF, 80-BF byte_status_i == b_utf8_2_2 <-> byte_status_im1 == b_utf8_2_1 , % E0-EF, 80-BF, 80-BF byte_status_i == b_utf8_3_2 <-> byte_status_im1 == b_utf8_3_1 , byte_status_i == b_utf8_3_3 <-> byte_status_im1 == b_utf8_3_2 , % F0-F7, 80-BF, 80-BF, 80-BF byte_status_i == b_utf8_4_2 <-> byte_status_im1 == b_utf8_4_1 , byte_status_i == b_utf8_4_3 <-> byte_status_im1 == b_utf8_4_2 , byte_status_i == b_utf8_4_4 <-> byte_status_im1 == b_utf8_4_3 , % % EUC-JP (CP51932) % (A1-A8, AD, B0-F4, F9-FC), (A1-FE) % byte_status_i == b_euc2 <-> byte_status_im1 = b_euc1 , % % SJIS % % (A1-DF) % (81-9F, E0-FC), (40-7E, 80-FC) byte_status_i == b_sjis2_2 <-> byte_status_im1 == b_sjis2_1 ]) ); predicate init(var int: byte_status_i, int: i) = ( forall([ % % UTF-8 % % C2-DF, 80-BF if (i < 2) then byte_status_i != b_utf8_2_2 else true endif , % E0-EF, 80-BF, 80-BF if (i < 2) then byte_status_i != b_utf8_3_2 else true endif , if (i < 3) then byte_status_i != b_utf8_3_3 else true endif , % F0-F7, 80-BF, 80-BF, 80-BF if (i < 2) then byte_status_i != b_utf8_4_2 else true endif , if (i < 3) then byte_status_i != b_utf8_4_3 else true endif , if (i < 4) then byte_status_i != b_utf8_4_4 else true endif , % % EUC-JP (CP51932) % (A1-A8, AD, B0-F4, F9-FC), (A1-FE) % if (i < 2) then byte_status_i != b_euc2 else true endif , % % SJIS % % (A1-DF) % (81-9F, E0-FC), (40-7E, 80-FC) if (i < 2) then byte_status_i != b_sjis2_2 else true endif ]) ); %------------------------------------------------------------------------------% % Search and Solve solve :: seq_search([ int_search(byte_status, first_fail, indomain_split, complete), int_search(char_start, input_order, indomain_max, complete), int_search(encoding, first_fail, indomain_split, complete) ]) minimize objective; %------------------------------------------------------------------------------% % Output output [ "encoding = ", show(encoding), ";\n", "byte_status = ", show(byte_status), ";\n", "char_start = ", show(char_start), ";\n", "constraint objective = ", show(objective), ";\n" ]