fork download
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <string.h>
  4.  
  5. typedef unsigned int uint;
  6. typedef unsigned char byte;
  7.  
  8. uint flen( FILE* f ) {
  9. fseek( f, 0, SEEK_END );
  10. uint len = ftell(f);
  11. fseek( f, 0, SEEK_SET );
  12. return len;
  13. }
  14.  
  15. int obwt_bwts( byte* T, byte* U, int* A, int n );
  16. int obwt_unbwts( byte* T, byte* U, int* LF, int n );
  17.  
  18. // Usage: BWTS c/d <inpfile> <outfile>
  19. int main( int argc, char** argv ) {
  20.  
  21. uint mode = (argc<2) ? 0 : (argv[1][0]=='d');
  22. FILE* f = argc<3 ? 0 : fopen( argv[2], "rb" ); if( f==0 ) f=stdin;
  23. FILE* g = argc<4 ? 0 : fopen( argv[3], "wb" ); if( g==0 ) g=stdout;
  24.  
  25. uint inplen = flen(f);
  26. byte* inpbuf = new byte[inplen]; if( inpbuf==0 ) return 1;
  27. byte* outbuf = new byte[inplen]; if( outbuf==0 ) return 1;
  28. int * ptrtbl = new int [inplen]; if( ptrtbl==0 ) return 1;
  29.  
  30. fread( inpbuf, 1,inplen, f );
  31.  
  32. mode ?
  33. obwt_unbwts( inpbuf, outbuf, ptrtbl, inplen ) :
  34. obwt_bwts( inpbuf, outbuf, ptrtbl, inplen );
  35.  
  36. fwrite( outbuf, 1,inplen, g );
  37.  
  38. }
  39.  
  40. #define chr(i) (cs==sizeof(int) ? ((int*)T)[i]:((byte*)T)[i])
  41. #define isLMS(_a) ((bvec_get(F1,(_a))!=0) || ((bvec_get(F2,(_a))!=0) && (bvec_get(F2,(_a)-1)==0)))
  42.  
  43. int bvec_get( byte* B, int i ) {
  44. return ( B[i >> 3] >> ( i& 7 ) )& 1;
  45. }
  46.  
  47. void bvec_set( byte* B, int i, int b ) {
  48. if( b ) {
  49. B[i>>3] |= byte( 1U<<(i&7) );
  50. } else {
  51. B[i>>3] &= byte(~(1U<<(i&7)));
  52. }
  53. }
  54.  
  55. uint bvec_next( byte* B, uint i ) {
  56. uint j;
  57. uint c;
  58. i += 1;
  59. j = i >> 3;
  60. c = B[j] >> ( i& 7 );
  61. if( c==0 ) {
  62. i += 8-(i&7);
  63. j += 1;
  64. for(; (c=B[j])==0; j++, i+=8 );
  65. }
  66. for(; (c&1)==0; i++, c>>=1 );
  67. return i;
  68. }
  69.  
  70. int bvec_prev( byte* B, int i ) {
  71. int j;
  72. uint c;
  73. i -= 1;
  74. if ( 0 <= i ) {
  75. j = i >> 3;
  76. c = ( B[j] << ( 7-( i& 7 ) ) )& 0xff;
  77. if ( c == 0 ) {
  78. i -= ( i& 7 ) + 1;
  79. j -= 1;
  80. for ( ; ( 0 <= j ) && ( ( c = B[j] ) == 0 ); --j, i -= 8 ){}
  81. if ( c == 0 ) {
  82. c = 128;
  83. }
  84. }
  85. for ( ; ( c& 128 ) == 0; --i, c <<= 1 ){}
  86. }
  87. return i;
  88. }
  89.  
  90.  
  91. void getCounts( byte* T, int* C, int n, int k, int cs ) {
  92. int i;
  93. for( i=0; i<k; ++i ) C[i]=0;
  94. for( i=0; i<n; ++i ) ++C[chr(i)];
  95. }
  96.  
  97. void getBuckets( int* C, int* B, int k, int end ) {
  98. int i, sum = 0;
  99. if ( end ) {
  100. for( i=0; i<k; ++i ) {
  101. sum += C[i];
  102. B[i] = sum;
  103. }
  104. } else {
  105. for( i=0; i<k; ++i ) {
  106. sum += C[i];
  107. B[i] = sum - C[i];
  108. }
  109. }
  110. }
  111.  
  112.  
  113. void induceSA( byte* T, int* SA, int* C, int* B, byte* F1, byte* F2, byte* F3, int n, int k, int cs ) {
  114. int i, j, p;
  115.  
  116. if( C==B ) getCounts( T, C, n, k, cs );
  117. getBuckets( C, B, k, 0 );
  118.  
  119. p = bvec_prev( F3, n );
  120.  
  121. for( i=0; i<n; ++i ) {
  122.  
  123. while( (p>=0) && (B[chr(p)]==i) ) {
  124. j = bvec_next( F1, p ) - 1;
  125. SA[B[chr(j)]++] = j;
  126. p = bvec_prev( F3, p );
  127. }
  128. j = SA[i];
  129. if( j<0 ) continue;
  130.  
  131. if( bvec_get(F1,j)==0 ) j-=1;
  132. else j=bvec_next(F1,j)-1;
  133.  
  134. if( bvec_get(F2,j)==0 ) SA[B[chr(j)]++]=j;
  135. }
  136.  
  137. if( C==B ) getCounts( T, C, n, k, cs );
  138.  
  139. getBuckets( C, B, k, 1 );
  140.  
  141. for( i=n-1; i>=0; i-- ) {
  142. j = SA[i];
  143. if( j<0 ) continue;
  144. if( bvec_get(F1,j)==0 ) if( bvec_get(F2,--j)!=0 ) SA[--B[chr(j)]]=j;
  145. }
  146. }
  147.  
  148. int lw_suffixsort( byte* T, int* SA, byte* F1, int fs, int n, int k, int cs ) {
  149.  
  150. byte* F2, * F3, * F4;
  151. int* C, * B, * RA;
  152. int i, j, c, m, p, q, plen, qlen, name;
  153. int c0, c1;
  154. int diff;
  155.  
  156. F2 = ( byte* )calloc( ( n + 7 ) / 8, sizeof(byte));
  157. F3 = ( byte* )calloc( ( n / 8+1 ), sizeof(byte));
  158. if ( ( F2 == NULL ) || ( F3 == NULL ) ) {
  159. free( F2 );
  160. free( F3 );
  161. return - 2;
  162. }
  163. for ( p = 0; p < n; p = q ) {
  164. q = bvec_next( F1, p );
  165. for ( i = q - 2, c = 0, c1 = chr( q - 1 ), m = 0; p <= i; --i, c1 = c0 ) {
  166. bvec_set( F2, i + 1, c );
  167. if ( ( c0 = chr( i ) ) < ( c1 + c ) ) {
  168. c = 1;
  169. } else if ( c != 0 ) {
  170. m += 1;
  171. c = 0;
  172. }
  173. }
  174. bvec_set( F2, p, 1 );
  175. if ( ( m == 0 ) && ( c == 0 ) ) {
  176. bvec_set( F3, p, 1 );
  177. }
  178. }
  179. bvec_set( F3, n, 1 );
  180.  
  181. if ( k <= fs ) {
  182. C = SA + n;
  183. B = ( k <= ( fs - k ) ) ? C + k: C;
  184. } else if ( ( C = B = ( int* )malloc( k* sizeof( int ) ) ) == NULL ) {
  185. free( F2 );
  186. free( F3 );
  187. return - 2;
  188. }
  189. getCounts( T, C, n, k, cs );
  190. getBuckets( C, B, k, 1 );
  191. for( i=0; i<n; ++i ) {
  192. SA[i] = - 1;
  193. }
  194. for( i=0; i<n; ++i ) {
  195. if ( isLMS( i ) && ( bvec_get( F3, i ) == 0 ) ) {
  196. SA[--B[chr( i )]] = i;
  197. }
  198. }
  199. induceSA( T, SA, C, B, F1, F2, F3, n, k, cs );
  200. if ( fs < k ) {
  201. free( C );
  202. }
  203.  
  204. for ( i = 0, m = 0; i<n; ++i ) {
  205. p = SA[i];
  206. if ( isLMS( p ) && ( bvec_get( F3, p ) == 0 ) ) {
  207. SA[m++] = p;
  208. }
  209. }
  210. for ( i = m; i<n; ++i ) {
  211. SA[i] = 0;
  212. }
  213.  
  214. for ( p = 0; p < n; p = q ) {
  215. q = bvec_next( F1, p );
  216. for ( i = q - 2, c = 0, c1 = chr( q - 1 ), j = q; p <= i; --i, c1 = c0 ) {
  217. if ( ( c0 = chr( i ) ) < ( c1 + c ) ) {
  218. c = 1;
  219. } else if ( c != 0 ) {
  220. SA[m + ( ( i + 1 ) >> 1 )] = j - i - 1;
  221. j = i + 1;
  222. c = 0;
  223. }
  224. }
  225. if ( ( j < q ) || ( c != 0 ) ) {
  226. SA[m + ( p >> 1 )] = j - p;
  227. }
  228. }
  229.  
  230. for ( i = 0, name = 0, q = n, qlen = 0; i<m; ++i ) {
  231. p = SA[i], plen = SA[m + ( p >> 1 )], diff = 1;
  232. if ( plen == qlen ) {
  233. for ( j = 0; ( j < plen ) && ( chr( p + j ) == chr( q + j ) ); j++ ){}
  234. if ( j == plen ) {
  235. diff = 0;
  236. }
  237. }
  238. if ( diff != 0 ) {
  239. ++name, q = p, qlen = plen;
  240. }
  241. SA[m + ( p >> 1 )] = name;
  242. }
  243.  
  244. if ( name < m ) {
  245. F4 = ( byte* )calloc( ( m / 8+1 ), sizeof(byte));
  246. if ( F4 == NULL ) {
  247. free( F2 );
  248. free( F3 );
  249. return - 1;
  250. }
  251. for ( i = 0, j = 0; i<n; ++i ) {
  252. if ( isLMS( i ) && ( bvec_get( F3, i ) == 0 ) ) {
  253. bvec_set( F4, j++, bvec_get( F1, i ) );
  254. }
  255. }
  256. bvec_set( F4, m, 1 );
  257. RA = SA + n + fs - m;
  258. for ( i = n - 1, j = m - 1; m <= i; --i ) {
  259. if ( SA[i] != 0 ) {
  260. RA[j--] = SA[i] - 1;
  261. }
  262. }
  263. if ( lw_suffixsort( ( byte* )RA, SA, F4, fs + n - m * 2, m, name, sizeof( int ) ) != 0 ) {
  264. free( F2 );
  265. free( F3 );
  266. free( F4 );
  267. return - 2;
  268. }
  269. free( F4 );
  270. for ( i = 0, j = 0; i<n; ++i ) {
  271. if ( isLMS( i ) && ( bvec_get( F3, i ) == 0 ) ) {
  272. RA[j++] = i;
  273. }
  274. }
  275. for( i=0; i<m; ++i ) {
  276. SA[i] = RA[SA[i]];
  277. }
  278. }
  279.  
  280. if( k<=fs ) {
  281. C = SA + n;
  282. B = ( k <= ( fs - k ) ) ? C + k: C;
  283. } else if ( ( C = B = ( int* )malloc( k* sizeof( int ) ) ) == NULL ) {
  284. free( F2 );
  285. free( F3 );
  286. return -2;
  287. }
  288.  
  289. getCounts( T, C, n, k, cs );
  290. getBuckets( C, B, k, 1 );
  291.  
  292. for( i=m; i<n; i++ ) SA[i]=-1;
  293.  
  294. for ( i = m - 1; 0 <= i; --i ) {
  295. j = SA[i], SA[i] = - 1;
  296. SA[--B[chr( j )]] = j;
  297. }
  298. induceSA( T, SA, C, B, F1, F2, F3, n, k, cs );
  299.  
  300. if( fs<k ) free( C );
  301.  
  302. free( F2 );
  303. free( F3 );
  304. return 0;
  305. }
  306.  
  307. int obwt_bwts( byte* T, byte* U, int* A, int n ) {
  308. byte* F;
  309. int i, j, k, p, q;
  310. int c1, c2;
  311.  
  312. if( (T==0) || (U==0) || (A==0) || (n<0) ) return -1;
  313.  
  314. if( n<=1 ) { if( n==1 ) U[0] = T[0]; return 0; }
  315.  
  316. F = (byte*)calloc( (n/8+1), sizeof(byte) ); if( F==NULL ) return -2;
  317.  
  318. for( i=0, j=k=1; j<=n; ) {
  319. for ( p = i, q = j;; ) {
  320. c1 = ( p < n ) ? T[p]: - 1;
  321. c2 = ( q < n ) ? T[q]: - 1;
  322. ++p, ++q;
  323. if ( c1 != c2 ) {
  324. break;
  325. }
  326. if ( p == j ) {
  327. k = q;
  328. p = i;
  329. }
  330. }
  331. if ( c1 < c2 ) {
  332. j = k = q;
  333. } else {
  334. bvec_set( F, i, 1 );
  335. i = k;
  336. j = ( k += 1 );
  337. }
  338. }
  339. bvec_set( F, n, 1 );
  340.  
  341. if ( lw_suffixsort( T, A, F, 0, n, 256, sizeof(byte)) != 0 ) {
  342. free( A );
  343. free( F );
  344. return - 2;
  345. }
  346.  
  347. if ( T == U ) {
  348. for( i=0; i<n; ++i ) {
  349. p = A[i];
  350. if ( bvec_get( F, p ) == 0 ) {
  351. p -= 1;
  352. } else {
  353. p = bvec_next( F, p ) - 1;
  354. }
  355. c1 = T[i];
  356. U[i] = byte( ( i <= p ) ? T[p]: A[p] );
  357. A[i] = c1;
  358. }
  359. } else if ( ( ( T < U ) && ( U < ( T + n ) ) ) || ( ( U < T ) && ( T < ( U + n ) ) ) ) {
  360. for( i=0; i<n; ++i ) {
  361. p = A[i];
  362. if ( bvec_get( F, p ) == 0 ) {
  363. p -= 1;
  364. } else {
  365. p = bvec_next( F, p ) - 1;
  366. }
  367. A[i] = T[p];
  368. }
  369. for( i=0; i<n; ++i ) {
  370. U[i] = ( byte )A[i];
  371. }
  372. } else {
  373. for( i=0; i<n; ++i ) {
  374. p = A[i];
  375. if ( bvec_get( F, p ) == 0 ) {
  376. p -= 1;
  377. } else {
  378. p = bvec_next( F, p ) - 1;
  379. }
  380. U[i] = T[p];
  381. }
  382. }
  383.  
  384. free( F );
  385.  
  386. return 0;
  387. }
  388.  
  389.  
  390. int obwt_unbwts( byte* T, byte* U, int* LF, int n ) {
  391. int C[256];
  392. byte* V;
  393. int i, j, p, t;
  394.  
  395. if( (T==0) || (U==0) || (LF==0) || (n<0) ) return -1;
  396.  
  397. if( n<=1 ) { if( n==1 ) U[0]=T[0]; return 0; }
  398.  
  399. V = new byte[n];
  400.  
  401. if( LF && V ) {
  402.  
  403. for( i=0; i<256; ++i ) C[i] = 0;
  404.  
  405. for( i=0; i<n; ++i ) C[V[i] = T[i]] += 1;
  406.  
  407. for( i=0, j=0; i<256; ++i ) { t=C[i]; C[i]=j; j+=t; }
  408.  
  409. for( i=0; i<n; ++i ) LF[i] = C[V[i]]++;
  410.  
  411. for( i=0, j=n-1; j>=0; ++i ) {
  412. if( LF[i]>=0 ) {
  413. p = i;
  414. do {
  415. U[j--] = V[p];
  416. t = LF[p];
  417. LF[p]=-1;
  418. p = t;
  419. } while( LF[p]>=0 );
  420. }
  421. }
  422. }
  423.  
  424. if( V ) delete[] V;
  425.  
  426. return 0;
  427. }
  428.  
  429. #undef chr
  430. #undef isLMS
  431.  
  432.  
Success #stdin #stdout 0s 3480KB
stdin
abracadabra for BWTS
stdout
Sr TWBardrcaaaa fobb