fork download
  1. using System;
  2. using System.Linq;
  3. using System.Collections.Generic;
  4. using System.Text;
  5.  
  6. namespace PMS.Common.Collections
  7. {
  8. // Abstract Suffix Tree class. Defines interface for different implementations of
  9. // suffix tree construction algorithms and internal representations
  10. //
  11. // Usage example:
  12. // var tree = SuffixTree.Build("cacao", SuffixTreeAlgorithm.UkkonenLinear);
  13. // var matches = tree.Match("ca").ToArray();
  14. // Console.WriteLine(tree.ToDotNotation());
  15. //
  16. // Dot notation can be rendered here: http://w...content-available-to-author-only...z.com
  17. public abstract class SuffixTree
  18. {
  19. #region Fields
  20. protected static char TerminationCharacter = '$';
  21. #endregion
  22.  
  23. #region Api
  24. public abstract bool IsMatch(string substring);
  25.  
  26. public abstract IEnumerable<int> Match(string substring);
  27.  
  28. public abstract string ToDotNotation();
  29. #endregion
  30.  
  31. #region Construction
  32. public static SuffixTree Build(string text, SuffixTreeAlgorithm algorithm)
  33. {
  34. if (text == null)
  35. {
  36. throw new ArgumentNullException(nameof(text));
  37. }
  38.  
  39. if (text.Contains(TerminationCharacter))
  40. {
  41. throw new ArgumentException("Input contains termination character", nameof(text));
  42. }
  43.  
  44. switch (algorithm)
  45. {
  46. case SuffixTreeAlgorithm.UkkonenLinear: // O(n)
  47. return new SuffixTreeUkkonenLinear(text);
  48.  
  49. default:
  50. throw new NotImplementedException($"No implementation for algoritm {algorithm}");
  51. }
  52. }
  53. #endregion
  54. }
  55.  
  56. public enum SuffixTreeAlgorithm
  57. {
  58. NaiveUsingTrie,
  59. Naive,
  60. UkkonenCubic,
  61. UkkonenQuadratic,
  62. UkkonenLinear,
  63. UkkonenFast,
  64. Weiner,
  65. McCreight
  66. }
  67.  
  68. /// <summary>
  69. /// Ukkonen linear time O(n) algorithm
  70. /// As described in the book by D. Gusfield, Algorithms on Strings, Trees and Sequences
  71. /// </summary>
  72. public class SuffixTreeUkkonenLinear : SuffixTree
  73. {
  74. #region Fields
  75. private static readonly int currentPosition = int.MinValue;
  76.  
  77. private readonly Node root;
  78. private readonly string text;
  79. #endregion
  80.  
  81. #region Constructor
  82. public SuffixTreeUkkonenLinear(string inputText)
  83. {
  84. text = inputText + TerminationCharacter;
  85. root = Build(text);
  86. }
  87. #endregion
  88.  
  89. #region Api
  90. public override bool IsMatch(string substring)
  91. {
  92. return Match(substring).Any();
  93. }
  94.  
  95. public override IEnumerable<int> Match(string substring)
  96. {
  97. var node = Navigate(root, 0, substring.Length, substring, text, false);
  98. if (!node.isFound)
  99. {
  100. yield break;
  101. }
  102.  
  103. var stack = new Stack<Node>();
  104. if (node.childIndex < 0)
  105. {
  106. stack.Push(node.parent);
  107. }
  108. else
  109. {
  110. stack.Push(node.parent.children[node.childIndex]);
  111. }
  112.  
  113. while (stack.Count > 0)
  114. {
  115. var current = stack.Pop();
  116.  
  117. if (HasChildren(current))
  118. {
  119. foreach (var child in current.children)
  120. {
  121. stack.Push(child);
  122. }
  123. }
  124. else
  125. {
  126. yield return current.pos;
  127. }
  128. }
  129. }
  130. #endregion
  131.  
  132. #region Methods
  133. private static Location Navigate(Node parent, int from, int to, string substring, string text, bool useSkipCount)
  134. {
  135. var node = parent;
  136.  
  137. if (from == to)
  138. {
  139. return new Location(true, node, from, 0, -1);
  140. }
  141.  
  142. // Navigate to the end of substring
  143. var k = from;
  144. while (true)
  145. {
  146. var childIndex = FindChild(substring[k], node, text);
  147. if (childIndex < 0)
  148. {
  149. return new Location(false, node, k, 0, -1);
  150. }
  151.  
  152. var child = node.children[childIndex];
  153. var m = 0;
  154.  
  155. if (useSkipCount)
  156. {
  157. var skip = Math.Min(to - k, end(child, to + 1) - child.start);
  158. m += skip;
  159. k += skip;
  160. }
  161. else
  162. {
  163. while (child.start + m < end(child, to + 1) &&
  164. k < to &&
  165. text[child.start + m] == substring[k])
  166. {
  167. ++m;
  168. ++k;
  169. }
  170. }
  171.  
  172. if (k == to)
  173. {
  174. return new Location(true, node, k, m, childIndex);
  175. }
  176. else if (child.start + m == end(child, to + 1))
  177. {
  178. if (!HasChildren(child))
  179. {
  180. return new Location(false, node, k, m, childIndex);
  181. }
  182. node = child;
  183. }
  184. else
  185. {
  186. return new Location(false, node, k, m, childIndex);
  187. }
  188. }
  189. }
  190.  
  191. private static int end(Node node, int pos)
  192. {
  193. if (node.end == currentPosition)
  194. return pos - 1;
  195.  
  196. return node.end;
  197. }
  198.  
  199. private static int FindChild(char c, Node node, string text)
  200. {
  201. if (node.children == null)
  202. {
  203. return -1;
  204. }
  205.  
  206. for (int i = 0; i < node.children.Count; ++i)
  207. {
  208. var child = node.children[i];
  209. if (text[child.start] == c)
  210. {
  211. return i;
  212. }
  213. }
  214.  
  215. return -1;
  216. }
  217.  
  218. private static bool HasChildren(Node node)
  219. {
  220. return node.children != null;
  221. }
  222. #endregion
  223.  
  224. #region Construction
  225. private Node Build(string text)
  226. {
  227. var builder = new UkkonenBuilder(text);
  228. return builder.Build();
  229. }
  230.  
  231. private class UkkonenBuilder
  232. {
  233. private readonly string text;
  234. private readonly Node root;
  235.  
  236. private Node activeLeaf;
  237. private Node nodeThatRequireSuffixLink;
  238.  
  239. public UkkonenBuilder(string text)
  240. {
  241. this.text = text;
  242. this.root = new Node
  243. {
  244. children = new List<Node>()
  245. };
  246. }
  247.  
  248. public Node Build()
  249. {
  250. activeLeaf = ConstructT(0);
  251. var next = new Next(root, 1, 1, 0);
  252.  
  253. for (int i = 1; i < text.Length; ++i)
  254. {
  255. nodeThatRequireSuffixLink = null;
  256.  
  257. // Phase i+1
  258.  
  259. // So the first extension of any phase is special and only takes constant time since the algorithm
  260. // has a pointer to the end of the current full string
  261. // I.e. when j = 0
  262. ApplyRule1(activeLeaf, i + 1);
  263.  
  264. while (next.j < i)
  265. {
  266. var location = Navigate(next.node, next.from, i, text, text, true);
  267. next = ApplyRule(location, next.j, i);
  268.  
  269. if (next.rule == 3)
  270. {
  271. // Rule 3 stops current phase
  272. break;
  273. }
  274. }
  275.  
  276. // Do not put TerminationCharacter to the tree
  277. if (i < text.Length - 1)
  278. {
  279. // Extend empty suffix, by putting the next character to the tree
  280. ConstructT(i);
  281. }
  282. }
  283.  
  284. foreach (var node in Visit(root))
  285. {
  286. if (node.end == currentPosition)
  287. {
  288. node.end = text.Length;
  289. }
  290. }
  291.  
  292. return root;
  293. }
  294.  
  295. private struct Next
  296. {
  297. public Node node;
  298. public int j;
  299. public int from;
  300. public int rule;
  301.  
  302. public Next(Node node, int j, int from, int rule)
  303. {
  304. this.node = node;
  305. this.j = j;
  306. this.from = from;
  307. this.rule = rule;
  308. }
  309. }
  310.  
  311. private Next ApplyRule(Location location, int j, int i)
  312. {
  313. if (location.childIndex == -1)
  314. {
  315. ApplyRule2(location.parent, -1, location.offsetInEdge, location.offsetInString, j);
  316.  
  317. if (location.parent.suffixLink == null)
  318. {
  319. return new Next(root, j + 1, j + 1, 2);
  320. }
  321. else
  322. {
  323. return new Next(location.parent.suffixLink, j + 1, i, 2);
  324. }
  325. }
  326.  
  327. if (location.offsetInString != i)
  328. {
  329. throw new Exception("What?");
  330. }
  331.  
  332. var child = location.parent.children[location.childIndex];
  333. if (child.start + location.offsetInEdge == end(child, i + 1))
  334. {
  335. if (HasChildren(child))
  336. {
  337. if (FindChild(child.children, text[i]) >= 0)
  338. {
  339. ApplyRule3();
  340.  
  341. return new Next(location.parent,
  342. j,
  343. i - location.offsetInEdge,
  344. 3);
  345. }
  346. else
  347. {
  348. ApplyRule2(child, -1, 0, location.offsetInString, j);
  349.  
  350. if (child.suffixLink != null)
  351. {
  352. return new Next(child.suffixLink.parent,
  353. j + 1,
  354. i - (end(child.suffixLink, i + 1) - child.suffixLink.start),
  355. 2);
  356. }
  357. else if (location.parent.suffixLink == null)
  358. {
  359. return new Next(root, j + 1, j + 1, 2);
  360. }
  361. else
  362. {
  363. return new Next(location.parent.suffixLink,
  364. j + 1,
  365. i - (end(child, i + 1) - child.start),
  366. 2);
  367. }
  368. }
  369. }
  370. else
  371. {
  372. ApplyRule1(child, i + 1);
  373.  
  374. if (location.parent.suffixLink == null)
  375. {
  376. return new Next(root, j + 1, j + 1, 1);
  377. }
  378. else
  379. {
  380. return new Next(location.parent.suffixLink,
  381. j + 1,
  382. i - location.offsetInEdge,
  383. 1);
  384. }
  385. }
  386. }
  387. else
  388. {
  389. if (text[child.start + location.offsetInEdge] == text[i])
  390. {
  391. ApplyRule3();
  392.  
  393. return new Next(location.parent,
  394. j,
  395. i - location.offsetInEdge,
  396. 3);
  397. }
  398. else
  399. {
  400. ApplyRule2(location.parent, location.childIndex, location.offsetInEdge, location.offsetInString, j);
  401.  
  402. if (location.parent.suffixLink == null)
  403. {
  404. if (location.parent.parent != null && location.parent.parent.suffixLink != null)
  405. {
  406. return new Next(location.parent.parent.suffixLink,
  407. j + 1,
  408. i - location.offsetInEdge - (end(location.parent, i + 1) - location.parent.start),
  409. 2);
  410. }
  411. else
  412. {
  413. return new Next(root, j + 1, j + 1, 2);
  414. }
  415. }
  416. else
  417. {
  418. return new Next(location.parent.suffixLink,
  419. j + 1,
  420. i - location.offsetInEdge,
  421. 2);
  422. }
  423. }
  424. }
  425. }
  426.  
  427. private Node ConstructT(int t)
  428. {
  429. var childIndex = FindChild(root.children, text[t]);
  430. if (childIndex >= 0)
  431. {
  432. return root.children[childIndex];
  433. }
  434.  
  435. var newNode = new Node
  436. {
  437. start = t,
  438. end = currentPosition,
  439. pos = t,
  440. parent = root
  441. };
  442.  
  443. root.children.Add(newNode);
  444. return newNode;
  445. }
  446.  
  447. private int FindChild(IList<Node> children, char c)
  448. {
  449. for (int i = 0; i < children.Count; ++i)
  450. {
  451. if (text[children[i].start] == c)
  452. {
  453. return i;
  454. }
  455. }
  456. return -1;
  457. }
  458.  
  459. private void ApplyRule1(Node leaf, int newEnd)
  460. {
  461. //Rule 1. Path ends at a leaf. Extend implicitly
  462. }
  463.  
  464. private void ApplyRule2(Node parent, int childIndex, int m, int k, int pos)
  465. {
  466. var newParent = default(Node);
  467. if (childIndex >= 0)
  468. {
  469. //Rule 2. Split label and add new leaf
  470. var child = parent.children[childIndex];
  471.  
  472. // 1) replace child with internal node
  473. newParent = new Node
  474. {
  475. start = child.start,
  476. end = child.start + m,
  477. parent = parent,
  478.  
  479. suffixLink = null,
  480. children = new List<Node>()
  481. };
  482.  
  483. parent.children[childIndex] = newParent;
  484.  
  485. // 2) adjust start position of the child and add it to the new internal node as a child
  486. child.start += m;
  487. child.parent = newParent;
  488. newParent.children.Add(child);
  489.  
  490. // Assign suffix link
  491. {
  492. // Corollary 6.1.1. In Ukkonen’s algorithm, any newly created internal node will have a
  493. // suffix link from it by the end of the next extension.
  494. //
  495. // I.e. if an edge is split, which means a new node was created, and if that was not the first
  496. // node created during the current phase, then connect the previously inserted node and the new node
  497. // through a suffix link.
  498. if (nodeThatRequireSuffixLink != null)
  499. {
  500. if (nodeThatRequireSuffixLink.suffixLink != null)
  501. {
  502. throw new Exception("What?");
  503. }
  504. nodeThatRequireSuffixLink.suffixLink = newParent;
  505. }
  506. nodeThatRequireSuffixLink = newParent;
  507. }
  508. }
  509. else
  510. {
  511. newParent = parent;
  512. }
  513.  
  514. // 3) add the rest of the suffix as a new child
  515. if (newParent.children == null)
  516. {
  517. newParent.children = new List<Node>();
  518. }
  519.  
  520. newParent.children.Add(new Node
  521. {
  522. start = k,
  523. end = currentPosition,
  524. parent = newParent,
  525.  
  526. pos = pos
  527. });
  528. }
  529.  
  530. private void ApplyRule3()
  531. {
  532. //Rule 3. Suffix is already in the tree
  533. // Do nothing
  534. }
  535. }
  536. #endregion
  537.  
  538. #region Visualization
  539.  
  540. public override string ToDotNotation()
  541. {
  542. var dotText = new StringBuilder();
  543. dotText.AppendLine("digraph g {");
  544. dotText.AppendLine("node[shape = circle];");
  545.  
  546. var labels = new Dictionary<Node, int>();
  547.  
  548. // Nodes
  549. foreach (var node in Visit(root))
  550. {
  551. int index = GetLabelIndex(labels, node);
  552.  
  553. if (!HasChildren(node))
  554. {
  555. dotText.AppendLine($"node{index} [label=\"{node.pos}\"]");
  556. }
  557. else
  558. {
  559. dotText.AppendLine($"node{index} [label=\"\"]");
  560.  
  561. foreach (var child in node.children.OrderBy(c => text[c.start]))
  562. {
  563. int childIndex = GetLabelIndex(labels, child);
  564. dotText.AppendLine($"node{index} -> node{childIndex} [label=\"{text.Substring(child.start, end(child, text.Length + 1) - child.start)}\"]");
  565. }
  566. }
  567.  
  568. if (node.suffixLink != null && node.suffixLink != root)
  569. {
  570. int suffixIndex = GetLabelIndex(labels, node.suffixLink);
  571. dotText.AppendLine($"node{index} -> node{suffixIndex} [style=\"dashed\", constraint=false, color=silver]");
  572. }
  573. }
  574.  
  575. dotText.AppendLine("}");
  576. return dotText.ToString();
  577. }
  578.  
  579. private static int GetLabelIndex(Dictionary<Node, int> labels, Node node)
  580. {
  581. int index;
  582. if (!labels.TryGetValue(node, out index))
  583. {
  584. index = labels.Count + 1;
  585. labels.Add(node, index);
  586. }
  587.  
  588. return index;
  589. }
  590.  
  591. private static IEnumerable<Node> Visit(Node root)
  592. {
  593. var stack = new Stack<Node>();
  594. stack.Push(root);
  595.  
  596. while (stack.Count > 0)
  597. {
  598. var current = stack.Pop();
  599.  
  600. if (HasChildren(current))
  601. {
  602. foreach (var child in current.children)
  603. {
  604. stack.Push(child);
  605. }
  606. }
  607.  
  608. yield return current;
  609. }
  610. }
  611.  
  612. #endregion
  613.  
  614. #region Types
  615. class Node
  616. {
  617. public int start;
  618. public int end;
  619. public Node parent;
  620.  
  621. // Leaf
  622. public int pos;
  623.  
  624. // Internal
  625. public Node suffixLink;
  626. public IList<Node> children;
  627. }
  628.  
  629. struct Location
  630. {
  631. public bool isFound;
  632. public Node parent;
  633. public int offsetInString;
  634. public int offsetInEdge;
  635. public int childIndex;
  636.  
  637. public Location(bool isFound, Node parent, int offsetInString, int offsetInEdge, int childIndex)
  638. {
  639. this.isFound = isFound;
  640. this.parent = parent;
  641. this.offsetInString = offsetInString;
  642. this.offsetInEdge = offsetInEdge;
  643. this.childIndex = childIndex;
  644. }
  645. }
  646. #endregion
  647. }
  648.  
  649. public class Test
  650. {
  651. public static void Main()
  652. {
  653. var tree = SuffixTree.Build(Console.ReadLine(), SuffixTreeAlgorithm.UkkonenLinear);
  654. Console.WriteLine("Use http://w...content-available-to-author-only...z.com to render the tree:");
  655. Console.WriteLine(tree.ToDotNotation());
  656. }
  657. }
  658. }
Success #stdin #stdout 0.03s 17644KB
stdin
cacao
stdout
Use http://w...content-available-to-author-only...z.com to render the tree:
digraph g {
node[shape = circle];
node1 [label=""]
node1 -> node2 [label="a"]
node1 -> node3 [label="ca"]
node1 -> node4 [label="o$"]
node4 [label="4"]
node2 [label=""]
node2 -> node5 [label="cao$"]
node2 -> node6 [label="o$"]
node6 [label="3"]
node5 [label="1"]
node3 [label=""]
node3 -> node7 [label="cao$"]
node3 -> node8 [label="o$"]
node3 -> node2 [style="dashed", constraint=false, color=silver]
node8 [label="2"]
node7 [label="0"]
}