// Copyright(c) 2006, Shinichi Morishita. All Rights Reserved. public class Chapter4_SuffixArrays { // Larsson-Sadakane Algorithm public static void suffixArray_LS(int[] S, int[] SA){ // Assume that all elements in S are positive except that the last is zero. int n = S.length; int[] ISA = new int[n]; // Basic step. radixSort_LS(S, SA, ISA); //Iterative step. boolean iteration; int h=1; do{ iteration = false; for(int i=0; i r) return; // Exit when the empty range is input. if(l == r){ // Set ISA[SA[l]] to l when the input is singleton. ISA[SA[l]] = l; return; } // Choose a pivot from ISA[SA[]+h]. int v = ISA[SA[l+(int)Math.floor(Math.random()*(r-l+1))]+h]; int i = l; int mi = l; int j = r; int mj = r; int tmp; for (;;) { // Compare values according to ISA[SA[]+h] for(; i <= j && ISA[SA[i]+h] <= v; i++) if(ISA[SA[i]+h] == v){ tmp = SA[i]; SA[i] = SA[mi]; SA[mi] = tmp; mi++; } for(; i <= j && v <= ISA[SA[j]+h]; j--) if(ISA[SA[j]+h] == v){ tmp = SA[j]; SA[j] = SA[mj]; SA[mj] = tmp; mj--; } if (i > j) break; tmp = SA[i]; SA[i] = SA[j]; SA[j] = tmp; i++; j--; } for(mi--, i--; l <= mi; mi--, i--){ tmp = SA[i]; SA[i] = SA[mi]; SA[mi] = tmp; } for(mj++, j++; mj <= r; mj++, j++){ tmp = SA[j]; SA[j] = SA[mj]; SA[mj] = tmp; } ternary_split_quicksort_LS(SA, l, i, ISA, h); for(int k = i+1; k < j; k++) ISA[SA[k]] = j-1; // update ISA ternary_split_quicksort_LS(SA, j, r, ISA, h); } public static int digit(int num, int j, int m){ int quotient = num; int remainder = 0; for(int i=0; i j, respectively. if(i0 < j0) return -1; else if(i0 == j0) if(i1 < j1) return -1; else if(i1 == j1) return 0; else return 1; else // i0 > j0 return 1; } public static int compare(int i0, int i1, int i2, int j0, int j1, int j2){ // return -1, 0, 1 if i < j, i == j, and i > j, respectively. if(i0 < j0) return -1; else if(i0 == j0) return compare(i1, i2, j1, j2); else return 1; } // Simple but naive binary search algorithm for the query. public static int searchLeftmost(int[] S, int[] SA, int[] query){ // Exit if the query, which is greater than "$", is outside the scope. if(compare(S,SA[SA.length-1], query) == -1) return -1; int left, right; for(left = 0, right = SA.length-1; left+1 < right; ){ int middle = (left + right) / 2; if(compare(S, SA[middle], query) == -1) left = middle; else right = middle; } if( compare(S, SA[right], query) == 0 ) return right; else return -1; // No occurrences of the query. } public static int compare(int S[], int start, int[] query){ // Return 0 if the query occurs as a prefix of the suffix S[start]. // Otherwise, return -1 and 1 if the suffix from "start" is lexicographically // lower or higher than query. for(int i=0; start + i < S.length && i < query.length; i++) if(S[start+i] < query[i]) return -1; else if(S[start+i] > query[i]) return 1; return 0; } // Advanced binary search algorithm for the query with lcp information. public static int searchLeftmost(int[] S, int[] SA, int[] query, int[] LCP, int[] LCP_AUX){ // Compute the lcp values between the query and both end suffixes. int left = 0; int lcp_left_query = 0; int right = SA.length-1; int lcp_right_query = lcp0(0, query, S, SA[right]); // Exit if the query, which is greater than "$", is outside the scope. if(!less_eq(lcp_right_query, query, S, SA[right])) return -1; // Binary search. for(int middle=(left+right)/2; left+1 < right; middle=(left+right)/2){ if(lcp1(left, middle, LCP, LCP_AUX) >= lcp1(middle, right, LCP, LCP_AUX)){ if(lcp_left_query < lcp1(left, middle, LCP, LCP_AUX)) left = middle; // lcp_left_query remains unchanged. else if(lcp_left_query > lcp1(left, middle, LCP, LCP_AUX)){ right = middle; lcp_right_query = lcp1(left, middle, LCP, LCP_AUX); }else{ // Set the lcp of the query and the middle suffix to i. int i = lcp0( lcp1(left, middle, LCP, LCP_AUX), query, S, SA[middle]); if(less_eq(i, query, S, SA[middle])){ right = middle; lcp_right_query = i; }else{ left = middle; lcp_left_query = i;} } }else{ if(lcp_right_query < lcp1(middle, right, LCP, LCP_AUX)){ right = middle; // lcp_right_query remains unchanged. }else if(lcp_right_query > lcp1(middle, right, LCP, LCP_AUX)){ left = middle; lcp_left_query = lcp1(middle, right, LCP, LCP_AUX); }else{ // Set the lcp of the query and the middle suffix to i. int i = lcp0( lcp1(middle, right, LCP, LCP_AUX), query, S, SA[middle]); if(less_eq(i, query, S, SA[middle])){ right = middle; lcp_right_query = i; }else{ left = middle; lcp_left_query = i;} } } } if(lcp_right_query == query.length) return right; else return -1; } public static boolean less_eq(int lcp, int[] query, int[] S, int start){ // Return true if the query is lower than or equal to the suffix. if(lcp == query.length) return true; if(start+lcp < S.length && query[lcp] < S[start+lcp]) return true; return false; } public static int lcp0(int offset, int[] query, int[] S, int start){ // Compute lcp. int i = offset; for(; i < query.length && start+i < S.length && query[i] == S[start+i]; i++); return i; } public static int lcp1(int i, int j, int[] LCP, int[] LCP_AUX){ // Lookup lcp tables. if(i+1 == j) return LCP[j]; else return LCP_AUX[(i+j)/2]; } // Build longest common prefixes. public static void buildLcp(int[] S, int[] SA, int[] LCP, int[] LCP_AUX){ // Assume that the last symbol of S is "$", and SA is the suffix array of S. // int[] LCP = new int[S.length]; int[] LCP_AUX = new int[S.length-1]; int n = SA.length; int[] ISA = new int[n]; for(int i=0; i 0) lcp--; } // Compute lcp values for all possible intervals and put them into LCP_AUX // according to LCP_AUX[(l+r)/2] = lcp(l,r) if l+1 < r. buildLcp1(0, SA.length-1, LCP, LCP_AUX); } public static int buildLcp1(int l, int r, int[] LCP, int[] LCP_AUX){ if(l+1 == r) return LCP[r]; else{ int v = Math.min(buildLcp1(l, (l+r)/2, LCP, LCP_AUX), buildLcp1((l+r)/2, r, LCP, LCP_AUX)); LCP_AUX[(l+r)/2] = v; return v; } } // Calculate occurrence frequencies of k-mers public static int[] occurrence_frequency(int[] SA, int[] LCP, int k){ int[] freq = new int[LCP.length]; int runLength = 1; for(int i = 0; i < LCP.length; i++){ // Checks if k > lcp(i,i+1) or not. if((i < LCP.length-1 && k > LCP[i+1]) || i == LCP.length-1){ // The suffix at SA[i+1] is new, or the end of LCP is hit. // Store the occurrence frequency of the current k-mer. for(int j = i+1-runLength; j <= i; j++) freq[SA[j]] = runLength; // Initialize for the new k-mer. runLength = 1; }else // Continue to search the current k-mer. runLength++; } return freq; } // Calculate the longest common factor public static int lcf(int[] ISA, int[] LCP, int left, int right){ // The query ranges from left to right in S. if(!(0 <= left && left <= right && right < ISA.length)) return 0; int answer = 0; for(int k = left; k <= right; k++){ int lcp; if(ISA[k]+1 >= ISA.length) lcp = LCP[ISA[k]]; else lcp = Math.max(LCP[ISA[k]], LCP[ISA[k]+1]); answer = Math.max(answer, Math.min(right-k+1, lcp)); } return answer; } public static int[] nucleotides2IntArray(String targetString){ int targetLen = targetString.length(); int[] targetIntArray = new int[targetLen]; for(int i = 0; i < targetLen; i++) targetIntArray[i] = encode_char(targetString.charAt(i)); return targetIntArray; } public static int encode_char(char c){ // modified so that $ is the minimum value 0. switch (c){ case '$': return 0; case 'A': return 1; case 'C': return 2; case 'G': return 3; case 'T': return 4; default: return 5; } } public static int[] generateRandomNumbers(int targetLen){ // Generate an array in which elements are selected from {1,2,3,4} at random. int[] target = new int[targetLen]; // modified so that $ is the minimum value 0. for(int i=0; i -1) System.out.println("The naive algorithm find the leftmost boundary of the block of the query occurrences at "+qPos+" in SA."); else System.out.println("The naive algorithm does not find the query."); // Build lcp array. int[] LCP = new int[S.length]; int[] LCP_AUX = new int[S.length-1]; for(int i=0; i -1) System.out.println("The advanced algorithm find the leftmost boundary of the block of the query occurrences at "+qPos+" in SA."); else System.out.println("The advanced algorithm does not find the query."); // Build the list of occurrence frequencies of all k-mers. int k_mer = query.length; int[] freq = occurrence_frequency(SA, LCP, k_mer); /* System.out.println("Occurrence frequency of all "+k_mer+"-mers:"); for(int i=0; i