You selected exactp.pl

% Fisher's exact test
% exactp.pl
% developed on SWI-prolog 6.2.2, 64 bits 
% By Kenryo Indo
% 2013.5.30 (modified: 5.31, for relatively large numbers)

% examples. 

sample_data( sample1).
sample_data( sample2).

%sample( 1). 2 x 2 contingency table [2]

sample1:class( c1, 'Men').
sample1:class( c2,'Women').

sample1:group( r1, 'Dieting').
sample1:group( r2, 'Non-dieting').

sample1:table( r1, c1, 1).
sample1:table( r1, c2, 9).
sample1:table( r2, c1, 11).
sample1:table( r2, c2, 3).

%sample( 2).  2 x 2 contingency table [3]

sample2:class( c1, 'tooth decay').
sample2:class( c2, 'no cavity').

sample2:group( r1, 'having a sweet tooth').
sample2:group( r2, 'not fond of sweets').

sample2:table( r1, c1, 13).
sample2:table( r1, c2, 4).
sample2:table( r2, c1, 6).
sample2:table( r2, c2, 14).


% The p-value : assuming the margins are fixed, 
% Under a null hypothesis of independence, 
% p-value can be computed by a hypergeometric distribution
% of the numbers in the cells of the table.

factorial_number( T, F):-
	 ascending_numbers( T, H),
	 product_list( H, E),
	 F is E.

ascending_numbers( T, H):-
	 length( L, T),
	 findall( I, nth1( I, L, _), H).

product_list( [], 1).  % <-- This clause was missing in the earlier version. 
product_list( [ X | T], F):-
	 product_list( T, X, F).

product_list( [], X, X).
product_list( [X|T], Y, F):-
	 Z = X * Y,
	 product_list( T, Z, F).

/*

?- product_list([1,2,3],L).
L = 3* (2*1).

 ?- length(L,10),nth1(J,L,_),ascending_numbers(J,X),nl,write(X),fail.

[1]
[1,2]
[1,2,3]
[1,2,3,4]
[1,2,3,4,5]
[1,2,3,4,5,6]
[1,2,3,4,5,6,7]
[1,2,3,4,5,6,7,8]
[1,2,3,4,5,6,7,8,9]
[1,2,3,4,5,6,7,8,9,10]
false.

 ?- length(L,10),nth0(J,[_|L],_),factorial_number(J,X),nl,write(fact(J)=X),fail.

fact(0)=1
fact(1)=1
fact(2)=2
fact(3)=6
fact(4)=24
fact(5)=120
fact(6)=720
fact(7)=5040
fact(8)=40320
fact(9)=362880
fact(10)=3628800
false.


*/

marginal_frequencies( X, [T1,T2,S1,S2]):-
	 X = [X11, X12, X21, X22],
	 T1 is X11 + X12,
	 T2 is X21 + X22,
	 S1 is X11 + X21,
	 S2 is X12 + X22.

observ_p( X, P):-
	 sum_list( X, N),
	 X = [X11, X12 | _],
	 marginal_frequencies( X, [T1,_,S1,S2]),
	 combin_num( N, T1, CD),
	 combin_num( S1, X11, CN1),
	 combin_num( S2, X12, CN2),
	 P is CN1 * CN2 / CD.

/*
observ_p_old( X, P):-
	 X = [X11, X12, X21, X22],
	 sum_list( X, N),
	 marginal_frequencies( X, [T1,T2,S1,S2]),
	 ascending_numbers( N, FN),
	 ascending_numbers( X11, F11),
	 ascending_numbers( X12, F12),
	 ascending_numbers( X21, F21),
	 ascending_numbers( X22, F22),
	 append( [FN, F11, F12, F21, F22], LD),
	 ascending_numbers( T1, FT1),
	 ascending_numbers( T2, FT2),
	 ascending_numbers( S1, FS1),
	 ascending_numbers( S2, FS2),
	 append( [ FT1, FT2, FS1, FS2], LN),
	 ord_subtract( LN, LD, LN1),
	 ord_subtract( LD, LN, LD1),
	 product_list( LN1, Numerator), 
	 product_list( LD1, Denominator), 
	 P is Numerator / Denominator.
*/


combin_num( N, K, C):-
	 length( L, N),
	 append( A, B, L),
	 length( A, K),
	 length( B, M),
	 ascending_numbers( N, NL),
	 ascending_numbers( K, DL1),
	 ascending_numbers( M, DL2),
	 append( DL1, DL2, DL),
	 ord_subtract( NL, DL, F),
	 ord_subtract( DL, NL, G),
	 product_list( F, Numerator), 
	 product_list( G, Denominator), 
	 C is Numerator / Denominator.

sample_table( K, X):-
	 sample_data( K),
	 findall( A, K:table(  _, _, A), X).



/*

?- append([[1,2],[1,2],[1,2,3]],K).
K = [1, 2, 1, 2, 1, 2, 3].

 ?- combin_num( 5,K,P), nl, write(K->P),fail.

0->1
1->5
2->10
3->10
4->5
5->1
false.

% for a 2 x 2 table in [4], (3, 2, 1, 4)

 ?- X=[3,2,1,4], marginal_frequencies(X,W).
X = [3, 2, 1, 4],
W = [5, 5, 4, 6].

 ?- X=[3,2,1,4], observ_p(X,P).
X = [3, 2, 1, 4],
P = 0.23809523809523808.


?- sample_table(K,X).
K = sample1,
X = [1, 9, 11, 3] ;
K = sample2,
X = [13, 4, 6, 14].

?- sample_table(K,X), observ_p(X,P).
K = sample1,
X = [1, 9, 11, 3],
P = 0.001346076187912236 ;
K = sample2,
X = [13, 4, 6, 14],
P = 0.005219867675736516.


*/


solve_sum( A, B, C):-
	 length( L, C),
	 append( G, H, L),
	 length( G, A),
	 length( H, B).
	  
generate_contingency_table( [T1,T2,S1,S2], X):-
	 X = [X11, X12, X21, X22],
	 solve_sum( X11, X12, T1),
	 solve_sum( X11, X21, S1), % Note. This order is crucial to time comsumption 
	 solve_sum( X21, X22, T2),
	 solve_sum( X12, X22, S2).

generate_contingency_table_from( X, W, Y):-
	 marginal_frequencies( X, W), 
	 generate_contingency_table( W, Y).

exact_p( X, P):-
	 observ_p( X, Q),
	 G = generate_contingency_table_from( X, _, Y),
	 findall( R, (G, observ_p( Y, R), R =< Q), L),
	 sum_list( L, P).
	 

/*

 ?- solve_sum(A, B, 2).
A = 0,
B = 2 ;
A = B, B = 1 ;
A = 2,
B = 0 ;
false.

 ?- X=[3,2,1,4], marginal_frequencies(X,W).
X = [3, 2, 1, 4],
W = [5, 5, 4, 6].

?- W=[5,5,4,6], generate_contingency_table(W,Y).
W = [5, 5, 4, 6],
Y = [0, 5, 4, 1] ;
W = [5, 5, 4, 6],
Y = [1, 4, 3, 2] ;
W = [5, 5, 4, 6],
Y = [2, 3, 2, 3] ;
W = [5, 5, 4, 6],
Y = [3, 2, 1, 4] ;
W = [5, 5, 4, 6],
Y = [4, 1, 0, 5] ;
false.

 ?- X=[3,2,1,4], exact_p(X,P).
X = [3, 2, 1, 4],
P = 0.5238095238095237.

 ?- sample_table(K,X), exact_p(X,P).

K = sample1,
X = [1, 9, 11, 3],
P = 0.0027594561852200836 ;

K = sample2,
X = [13, 4, 6, 14],
P = 0.008138143815466445.

% comparison with fisher.test using R 

> F <- fisher.test(matrix(c(3,2,1,4),nrow=2))
> F$p.value
[1] 0.5238095

> F1 <- fisher.test(matrix(c(1,9,11,3),nrow=2))
> F1$p.value
[1] 0.002759456

> F2 <- fisher.test(matrix(c(13,4,6,14),nrow=2))
> F2$p.value
[1] 0.008138144

*/



% for relatively large numbers


ln_list( L, H):-
	 findall( X,
	 (
	 	 member(W, L),
		 X is log( W)
	 ), H).

combin_num_ln( N, K, C):-
	 length( L, N),
	 append( A, B, L),
	 length( A, K),
	 length( B, M),
	 ascending_numbers( N, NL),
	 ascending_numbers( K, DL1),
	 ascending_numbers( M, DL2),
	 append( DL1, DL2, DL),
	 ord_subtract( NL, DL, F),
	 ord_subtract( DL, NL, G),
	 ln_list( F, FL), 
	 ln_list( G, GL), 
	 sum_list( FL, CN), 
	 sum_list( GL, CD), 
	 C is ( CN - CD).



observ_p_ln( X, P):-
	 sum_list( X, N),
	 X = [ X11, X12 | _],
	 marginal_frequencies( X, [T1, _,S1, S2]),
	 combin_num_ln( N, T1, CDL),
	 combin_num_ln( S1, X11, CNL1),
	 combin_num_ln( S2, X12, CNL2),
	 P is exp( CNL1 + CNL2 - CDL).

exact_p_ln( X, P):-
	 observ_p_ln( X, Q),
	 G = generate_contingency_table_from( X, _, Y),
%	 G = gen_ctab_from( X, _, Y),
	 findall( R, (G, observ_p_ln( Y, R), R =< Q), L),
	 sum_list( L, P).


/*

% for another example in [4].

 ?- X=[228,863,284,851], exact_p(X,P).

ERROR: is/2: Arithmetic: evaluation error: `float_overflow'

% Then, let us use logarithm.

?- combin_num_ln(2226, 1091,P).

P = 1538.4310457750516 .

% Incidentally, time/1 of swi-prolog may be beneficial.

?- X=[228,863,284,851], time(exact_p_ln(X,P)).
% 54,256,617 inferences, 15.553 CPU in 15.553 seconds (100% CPU, 3488431 Lips)

X = [228, 863, 284, 851],
P = 0.02334574579155282 .

% comparison to R

> F5<-  fisher.test(matrix(c(228,863,284,851),nrow=2))
> F5$p.value
[1] 0.02334575
 

% other preliminary tests

 ?- sample_table(K,X), generate_contingency_table_from( X, _, Y),
  observ_p(Y,Q),observ_p_ln(Y,P),R is P-Q,nl,write(Y;R),fail.

[0,10,12,2];2.710505431213761e-20
[1,9,11,3];2.8189256484623115e-18
[2,8,10,4];1.3877787807814457e-17
[3,7,9,5];1.6653345369377348e-16
[4,6,8,6];2.498001805406602e-16
[5,5,7,7];4.996003610813204e-16
[6,4,6,8];2.498001805406602e-16
[7,3,5,9];1.6653345369377348e-16
[8,2,4,10];1.3877787807814457e-17
[9,1,3,11];2.8189256484623115e-18
[10,0,2,12];2.710505431213761e-20
[0,17,19,1];2.0679515313825692e-24
[1,16,18,2];1.5881867761018131e-22
[2,15,17,3];2.0328790734103208e-20
[3,14,16,4];4.336808689942018e-19
[4,13,15,5];1.1275702593849246e-17
[5,12,14,6];4.163336342344337e-17
[6,11,13,7];-1.3877787807814457e-17
[7,10,12,8];8.326672684688674e-17
[8,9,11,9];-2.7755575615628914e-16
[9,8,10,10];7.771561172376096e-16
[10,7,9,11];6.106226635438361e-16
[11,6,8,12];5.551115123125783e-17
[12,5,7,13];3.8163916471489756e-17
[13,4,6,14];-4.336808689942018e-18
[14,3,5,15];9.75781955236954e-19
[15,2,4,16];4.743384504624082e-20
[16,1,3,17];2.117582368135751e-22
[17,0,2,18];1.819797347616661e-23
false.

% negligence

 ?- X = [3,2,1,4], exact_p(X,P), exact_p_ln(X,Q).

X = [3, 2, 1, 4],
P = Q, Q = 0.5238095238095237 .

 ?- sample_table(K,X), exact_p(X,P).

K = sample1,
X = [1, 9, 11, 3],
P = 0.002759456185220083 ;

K = sample2,
X = [13, 4, 6, 14],
P = 0.008138143815466445 ;
false.

% The following was possible to compute.

 ?- X=[23,86,28,85], exact_p(X,P).

X = [23, 86, 28, 85],
P = 0.5280417128466433 .

%  comparison to R

> F4<-  fisher.test(matrix(c(23,86,28,85),nrow=2))
> F4$p.value
[1] 0.5280417

*/


% References:
% [1] Fisher, R. A. (1934). Statistical Methods for Research Workers. (5th ed. and later.) Oliver and Boyd.
% [2] http://en.wikipedia.org/wiki/Fisher%27s_exact_test
% [3] http://aoki2.si.gunma-u.ac.jp/lecture/Cross/Fisher.html
% [4] http://oku.edu.mie-u.ac.jp/~okumura/stat/fishertest.php



return to front page.