"""Lyndon.py Algorithms on strings and sequences based on Lyndon words. David Eppstein, October 2011.""" import unittest from Eratosthenes import MoebiusFunction def LengthLimitedLyndonWords(s,n): """Generate nonempty Lyndon words of length <= n over an s-symbol alphabet. The words are generated in lexicographic order, using an algorithm from J.-P. Duval, Theor. Comput. Sci. 1988, doi:10.1016/0304-3975(88)90113-2. As shown by Berstel and Pocchiola, it takes constant average time per generated word.""" w = [-1] # set up for first increment while w: w[-1] += 1 # increment the last non-z symbol yield w m = len(w) while len(w) < n: # repeat word to fill exactly n syms w.append(w[-m]) while w and w[-1] == s - 1: # delete trailing z's w.pop() def LyndonWordsWithLength(s,n): """Generate Lyndon words of length exactly n over an s-symbol alphabet. Since nearly half of the outputs of LengthLimitedLyndonWords(s,n) have the desired length, it again takes constant average time per word.""" if n == 0: yield [] # the empty word is a special case not handled by main alg for w in LengthLimitedLyndonWords(s,n): if len(w) == n: yield w def LyndonWords(s): """Generate all Lyndon words over an s-symbol alphabet. The generation order is by length, then lexicographic within each length.""" n = 0 while True: for w in LyndonWordsWithLength(s,n): yield w n += 1 def DeBruijnSequence(s,n): """Generate a De Bruijn sequence for words of length n over s symbols by concatenating together in lexicographic order the Lyndon words whose lengths divide n. The output length will be s^n. Because nearly half of the generated sequences will have length exactly n, the algorithm will take O(s^n/n) steps, and the bulk of the time will be spent in sequence concatenation.""" output = [] for w in LengthLimitedLyndonWords(s,n): if n % len(w) == 0: output += w return output def CountLyndonWords(s,n): """The number of length-n Lyndon words over s symbols.""" if n == 0: return 1 total = 0 for i in range(1,n+1): if n%i == 0: total += MoebiusFunction(n//i) * s**i return total//n def ChenFoxLyndonBreakpoints(s): """Find starting positions of Chen-Fox-Lyndon decomposition of s. The decomposition is a set of Lyndon words that start at 0 and continue until the next position. 0 itself is not output, but the final breakpoint at the end of s is. The argument s must be of a type that can be indexed (e.g. a list, tuple, or string). The algorithm follows Duval, J. Algorithms 1983, but uses 0-based indexing rather than Duval's choice of 1-based indexing.""" k = 0 while k < len(s): i,j = k,k+1 while j < len(s) and s[i] <= s[j]: i = (s[i] == s[j]) and i+1 or k # Python cond?yes:no syntax j += 1 while k < i+1: k += j-i yield k def ChenFoxLyndon(s): """Decompose s into Lyndon words according to the Chen-Fox-Lyndon theorem. The arguments are the same as for ChenFoxLyndonBreakpoints but the return values are subsequences of s rather than indices of breakpoints.""" old = 0 for k in ChenFoxLyndonBreakpoints(s): yield s[old:k] old = k def SmallestSuffix(s): """Find the suffix of s that is smallest in lexicographic order.""" for w in ChenFoxLyndon(s): pass return w def SmallestRotation(s): """Find the rotation of s that is smallest in lexicographic order. Duval 1983 describes how to modify his algorithm to do so but I think it's cleaner and more general to work from the ChenFoxLyndon output.""" prev,rep = None,0 for w in ChenFoxLyndon(s+s): if w == prev: rep += 1 else: prev,rep = w,1 if len(w)*rep == len(s): return w*rep raise Exception("Reached end of factorization with no shortest rotation") def isLyndonWord(s): """Is the given sequence a Lyndon word?""" if len(s) == 0: return True return next(ChenFoxLyndonBreakpoints(s)) == len(s) # If run standalone, perform unit tests class LyndonTest(unittest.TestCase): def testCount(self): """Test that we count Lyndon words correctly.""" for s in range(2,7): for n in range(1,6): self.assertEqual(CountLyndonWords(s,n), len(list(LyndonWordsWithLength(s,n)))) def testOrder(self): """Test that we generate Lyndon words in lexicographic order.""" for s in range(2,7): for n in range(1,6): prev = [] for x in LengthLimitedLyndonWords(s,n): self.assertTrue(prev < x) prev = list(x) def testSubsequence(self): """Test that words of length n-1 are a subsequence of length n.""" for s in range(2,7): for n in range(2,6): smaller = LengthLimitedLyndonWords(s,n-1) for x in LengthLimitedLyndonWords(s,n): if len(x) < n: self.assertEqual(x,next(smaller)) def testIsLyndon(self): """Test that the words we generate are Lyndon words.""" for s in range(2,7): for n in range(2,6): for w in LengthLimitedLyndonWords(s,n): self.assertEqual(isLyndonWord(w), True) def testNotLyndon(self): """Test that words that are not Lyndon words aren't claimed to be.""" nl = sum(1 for i in range(8**4) if isLyndonWord("%04o" % i)) self.assertEqual(nl,CountLyndonWords(8,4)) def testDeBruijn(self): """Test that the De Bruijn sequence is correct.""" for s in range(2,7): for n in range(1,6): db = DeBruijnSequence(s,n) self.assertEqual(len(db), s**n) db = db + db # duplicate so we can wrap easier subs = set(tuple(db[i:i+n]) for i in range(s**n)) self.assertEqual(len(subs), s**n) if __name__ == "__main__": unittest.main()