-- shredsolver.hs -- -- this is part two of the optional programming assignment for the -- Stanford Univerity Introduction to Artificial Intelligence class, -- which was to decode a message which was shredded into a random jumble of -- strips. -- -- The approach taken was to greedily combine strips from left to right -- until all strips have been used. That is, we choose an initial starting strip, -- then try all remaining strips, and choose the one that results in the -- highest score for the combined strips based on trigram frequency, then -- repeat until all strips used. -- We then repeat, this time choosing the starting strip from each of the candidates, -- and the result with the best score is our winner. -- -- To arrive at the correct result required some tweaking of the scoring. -- In my first attempt, the first 15 or so strips would be correct but the last 4 would be -- jumbled... I think this was because the last line is mostly blank, which -- upset the scoring, which was based on the sum of the log probability score -- for each row. Using only the first 6 out of 8 rows for scoring improved this -- significantly, however the correct answer was actually the third highest scoring result. -- The other two were almost correct but started with a column of spaces. -- Tweaking the scoring code to punish rows with a leading space ( assuming the text -- is left justified) brought the correct answer to the top of the list. -- -- source for trigram data is here: http://h...content-available-to-author-only...l.org/~cowan/trigrams -- the trigram data is read from std input -- -- PAC 28th January, 2012 import Data.Char (isLetter, toUpper, isSpace) import Data.List (sortBy, delete) import Data.Ord (comparing) import qualified Data.Map as M import System.IO import System.Environment import Control.Monad -- the code to break defaultShreddedMsg = "de| | f|Cl|nf|ed|au| i|ti| |ma|ha|or|nn|ou| S|on|nd|on\n\ \ry| |is|th|is| b|eo|as| | |f |wh| o|ic| t|, | |he|h \n\ \ab| |la|pr|od|ge|ob| m|an| |s |is|el|ti|ng|il|d |ua|c \n\ \he| |ea|of|ho| m| t|et|ha| | t|od|ds|e |ki| c|t |ng|br\n\ \wo|m,|to|yo|hi|ve|u | t|ob| |pr|d |s |us| s|ul|le|ol|e \n\ \ t|ca| t|wi| M|d |th|\"A|ma|l |he| p|at|ap|it|he|ti|le|er\n\ \ry|d |un|Th|\" |io|eo|n,|is| |bl|f |pu|Co|ic| o|he|at|mm\n\ \hi| | |in| | | t| | | | |ye| |ar| |s | | |. " -- define a datatype for our map of trigram frequencies for clarity type TrigramFrequencies = M.Map String Int -- a strip of our shredded message is simply a list of strings type Strip = [String] -- ShredState encapsulates the score for the current strip, and the remaining strips to be matched type ShredState = ( Double, Strip, [Strip] ) -- helper function for creating a ShredState makeShredState::Double -> Strip -> [Strip] -> ShredState makeShredState score strip strips = (score, strip, strips) -- define an infinitely long empty strip which will be useful for our initial state -- don't you love lazy evaluation? emptyStrip = [ [] | x <- [0..] ] -- function returns our initial state initialState strips = makeShredState 0 emptyStrip strips -- helpers for dealing with ShredStates getDecodedMessageStrip:: ShredState -> Strip getDecodedMessageStrip ( _, b, _ ) = b getRemainingStrips:: ShredState -> [Strip] getRemainingStrips ( _, _, c ) = c -- helper for sorting a list of ShredStates in descending order of score onScore:: ShredState -> ShredState -> Ordering onScore ( a,_,_) (b,_,_) | a > b = LT | a < b = GT | a == b = EQ -- splits a delimited string into a list of string representing the text found between the delimiters split :: Char -> String -> [String] split delim [] = [""] split delim (c:cs) | c == delim = "" : rest | otherwise = (c : head rest) : tail rest where rest = split delim cs --parse the shredded msg string into a more computer friendly form --firstParse converts the input to a list of lines, where each lines is a list of strings --representing the strings appearing on each strip that makes up the line makeStrips:: String -> [Strip] makeStrips str = [ map ( !! index ) (firstParse str) | index <- [ 0 .. (length $ head $ firstParse str) - 1] ] where firstParse = map (split '|') . lines join2Strips::Strip -> Strip ->Strip join2Strips = zipWith joinRowsOfStrip where joinRowsOfStrip r1 r2 = r1 ++ "|" ++ r2 -- just to check we parsed the strips correctly joinManyStrips::[Strip] -> Strip joinManyStrips [x] = x joinManyStrips (x:y:z) = joinManyStrips ((join2Strips x y):z) --parse a line from the trigams data file into a tuple of trigram and frequency per 10000 words parseTrigram:: String -> [ (String,Int) ] parseTrigram contents = [ lineToTuple( words x ) | x <- lines(contents), head x /= ';' ] where lineToTuple [ trigram, frequency ] = ( trigram, read frequency :: Int) -- loads the trigram frequency data into a Map for fast lookup -- source for trigram data is here: http://h...content-available-to-author-only...l.org/~cowan/trigrams -- we read it in from stdin -- returns an IO action yielding a Map containing a trigram and it's frequency per 10000 words loadTrigrams:: IO (TrigramFrequencies) loadTrigrams = fmap (M.fromList . parseTrigram) $ getContents -- preprocess our string for trigram lookup. Converts all non letters to # and the rest to uppercase preprocessText:: String -> String preprocessText text = map processChar text where processChar c = if isLetter c then toUpper c else '#' -- split a string into character trigrams splitIntoTrigrams:: String -> [String] splitIntoTrigrams str = [ take 3 $ drop x str | x <- [ 0 .. (length str - 3) ] ] -- lookup the trigram frequencies per 10000 words. If not found it will return 0 lookupTrigramFrequency:: TrigramFrequencies -> String -> Int lookupTrigramFrequency trigramFrequencies trigram = M.findWithDefault 0 trigram trigramFrequencies -- converts a letter frequency into a probability using laplace smoothing so we don't end up with -- trigrams that were not found borking our calculations (due to multiplying by 0) laplaceSmoothedProbability:: Int -> Int -> Int -> Double laplaceSmoothedProbability frequency totalCategories k = (fromIntegral(frequency) + fromIntegral(k))/(10000.0 + fromIntegral(k)*fromIntegral(totalCategories)) -- returns the log of the probability for a trigram. We use logs so we don't endup dealing with -- really small numbers caused by multiplying small numbers. We can just sum the logs instead. calcLogOfSmoothedProbabilityForTrigram:: TrigramFrequencies -> String -> Double calcLogOfSmoothedProbabilityForTrigram trigramFrequencies trigram = log $ laplaceSmoothedProbability score numClasses k where score = lookupTrigramFrequency trigramFrequencies $ preprocessText trigram numClasses = M.size trigramFrequencies k = 1 -- calculates the score for a string based on the probabilities of all the trigrams it contains scoreString:: TrigramFrequencies -> String -> Double scoreString trigramFrequencies = sum . map ( calcLogOfSmoothedProbabilityForTrigram trigramFrequencies ) . splitIntoTrigrams -- this function attempts to mark down strings that begin with spaces. We are making the assumption that the text is -- left justified. scoreStringPunishLeadingSpaces:: TrigramFrequencies -> String -> Double scoreStringPunishLeadingSpaces trigramFrequencies str = if (isSpace $ head str ) then ((scoreString trigramFrequencies str) + log 0.0000001) else (scoreString trigramFrequencies str) --takes two strips and joins them to produce a wider strip with the first strip to the left of the second strip combineStrips::Strip -> Strip ->Strip combineStrips = zipWith (++) -- take the product of the probabilities for each row of the strip in order to give the strip a score -- just use the first 6 rows as they are the complete lines (no partial lines), which seems to give a better result scoreStrip:: TrigramFrequencies -> Strip -> Double scoreStrip trigramFrequencies = sum . take 6. map (scoreStringPunishLeadingSpaces trigramFrequencies) -- expand the existing strip with each of the candidate strips and score it -- result is a list of ShredStates, each one step closer to our goal, scored from most likely to least likeley expandShredState::TrigramFrequencies -> ShredState -> [ShredState] expandShredState trigramFrequencies state = sortBy onScore $ zip3 score combined remainingStrips where combined = map (combineStrips (getDecodedMessageStrip state)) (getRemainingStrips state) score = map (scoreStrip trigramFrequencies) combined remainingStrips = [ delete x (getRemainingStrips state) | x <- (getRemainingStrips state) ] -- chooses the best expansion of the current state, ie: the one with the highest trigram score chooseBestExpansion:: TrigramFrequencies -> ShredState -> ShredState chooseBestExpansion trigramFrequencies state = head $ expandShredState trigramFrequencies state -- greedily combine strips, building on the most likely expansion at each level matchAll:: TrigramFrequencies -> ShredState -> ShredState matchAll trigramFrequencies ( score, existingStrip, [] ) = ( score, existingStrip, [] ) matchAll trigramFrequencies state = matchAll trigramFrequencies $ chooseBestExpansion trigramFrequencies state -- try starting with each strip and see what falls out startWithEachStripThenMatchAll trigramFrequencies strips = sortBy onScore $ [ matchAll trigramFrequencies state | state <- expandShredState trigramFrequencies $ initialState strips ] -- displays the string along with its score displayScoredMsg:: (Double, Strip, [Strip] ) -> IO () displayScoredMsg (score, strip, strips) = do putStr "score: " putStrLn $ show score putStrLn "" putStrLn $ unlines strip -- parse arguments, if no supplied use default parseArgs:: [String] -> String parseArgs args = if null args then defaultShreddedMsg else head args -- entry point main = do trigramFrequencies <- loadTrigrams args <- getArgs let msg = parseArgs args putStrLn "Shredded Message Solver (ai-class)\n" let strips = makeStrips msg putStrLn "shredded message:" putStrLn $ unlines $ joinManyStrips strips putStrLn "Top 3 results..." mapM_ displayScoredMsg $ take 3 $ startWithEachStripThenMatchAll trigramFrequencies strips