import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import SparkContext._
import java.io.File
import java.net.URI
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Scanner;
/**
 * Spark job that reads gzip-compressed text files named
 * `<5 digits><variable letter><4 digits>.txt.gz`, extracts the timestamped
 * lines of each file and writes one text record per timestamp that combines
 * the values of the five variables `i`, `j`, `k`, `w` and `d`.
 *
 * Usage: `Main <inputDir> <outputDir>`
 */
object Main {

  /**
   * Entry point.
   *
   * Arguments are validated before a [[SparkContext]] is created, so a bad
   * invocation does not start (and leak) a context. The context is always
   * stopped once the job has finished or failed.
   *
   * @param args `args(0)` is the input directory, `args(1)` the output directory
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      println(args.mkString(","))
      println("ERROR. Please, specify input and output directories.")
    } else {
      val appName = "SparkSpec"
      // `jarOfObject` yields None when this class was not loaded from a jar;
      // in that case no extra jar is registered instead of failing on `.get`.
      val jars = SparkContext.jarOfObject(this).toList
      println(jars)
      val conf = new SparkConf().setAppName(appName).setJars(jars)
      val sc = new SparkContext(conf)
      try {
        val inputDir = args(0)
        val outputDir = args(1)
        println("Input directory: " + inputDir)
        println("Output directory: " + outputDir)
        run(sc, inputDir, outputDir)
      } finally {
        sc.stop()
      }
    }
  }

  /** Path prefix that `toGlusterfsPath` removes from paths. */
  val GLUSTERFS_MOUNT_PATH = "/mnt/root"

  /** File name: five digits, one variable letter out of `dijkw`, four digits, `.txt.gz`. */
  val FILENAME_PATTERN = "([0-9]{5})([dijkw])([0-9]{4})\\.txt\\.gz".r

  /** Data line: five space-separated digit groups (4-2-2-2-2) followed by the rest of the line. */
  val LINE_PATTERN = "([0-9]{4} [0-9]{2} [0-9]{2} [0-9]{2} [0-9]{2})(.*)".r

  /** The variable letters accepted by `FILENAME_PATTERN`. */
  val VARIABLE_NAMES = Array("d", "i", "j", "k", "w")

  /**
   * Removes every occurrence of `GLUSTERFS_MOUNT_PATH` from `path`.
   *
   * @param path path that may contain the mount prefix
   * @return `path` without the mount prefix
   */
  def toGlusterfsPath(path: String): String =
    path.replace(GLUSTERFS_MOUNT_PATH, "")

  /**
   * Same as the `String` overload, applied to the absolute path of `file`.
   *
   * @param file file whose absolute path is converted
   * @return the absolute path without the mount prefix
   */
  def toGlusterfsPath(file: File): String =
    toGlusterfsPath(file.getAbsolutePath())

  /**
   * Tags every element of `array` with `fileNum` and `s`.
   *
   * @param fileNum first tuple component (the five-digit part of the file name in `run`)
   * @param s       second tuple component (the variable letter in `run`)
   * @param array   lines to tag
   * @return one `(fileNum, s, line)` triple per element of `array`, in order
   */
  def makeLines(fileNum: String, s: String, array: Array[String]): Array[(String, String, String)] =
    array.map(line => (fileNum, s, line))

  /**
   * Reads the leading run of float tokens from `s` and renders them as
   * `[a, b, c]`. Reading stops at the first token that is not a float, so a
   * string without a leading float yields `[]`.
   *
   * @param s whitespace-separated text
   * @return the parsed floats formatted as a bracketed, comma-separated list
   */
  def parseString(s: String): String = {
    // Pin the locale: with the JVM default locale a comma-decimal locale
    // would not recognise values such as "1.5" as floats.
    val scanner = new Scanner(s).useLocale(java.util.Locale.US)
    Iterator
      .continually(scanner)
      .takeWhile(_.hasNextFloat())
      .map(_.nextFloat())
      .mkString("[", ", ", "]")
  }

  /**
   * Runs the job: reads `inputDir/*.gz`, groups the matching lines by
   * (file id, date) and saves one line per date to `outputDir` in the form
   * `date<TAB>[i=[..],j=[..],k=[..],w=[..],d=[..]]`.
   *
   * Notes on what the pipeline does:
   *  - files whose name does not match `FILENAME_PATTERN` and lines that do
   *    not match `LINE_PATTERN` are dropped;
   *  - when several lines share the same (id, date, variable), only the first
   *    one returned by the grouping is used;
   *  - only (id, date) groups holding exactly five entries are kept;
   *  - the id is dropped before sorting, so the output is ordered by date only.
   *
   * @param sc        Spark context used to read the input
   * @param inputDir  directory containing the `.gz` input files
   * @param outputDir directory passed to `saveAsTextFile`
   */
  def run(sc: SparkContext, inputDir: String, outputDir: String): Unit = {
    sc
      .wholeTextFiles(toGlusterfsPath(inputDir + "/*.gz"))
      // (glusterfs:/datasets/ndbc-small/41013i2012.txt.gz, file text)
      .map(it => ((new File(new File(it._1).toURI().getPath()).getName()), it._2))
      // (45005w2012.txt.gz, file text)
      .map(it => (FILENAME_PATTERN.pattern.matcher(it._1), it))
      .filter(it => it._1.matches())
      .map(it => ((it._1.group(1), it._1.group(2)), it._2._2))
      // ((45005, w), file text)
      .map(it => (it._1, it._2.split("\\n")))
      // ((45005, w), lines)
      .flatMap(it => makeLines(it._1._1, it._1._2, it._2))
      // (45005, w, line)
      .map(it => (LINE_PATTERN.pattern.matcher(it._3), it))
      // (matcher, (45005, w, line))
      .filter(it => it._1.matches())
      .map(it => ((it._2._1, it._1.group(1), it._2._2), (it._1.group(2))))
      // ((45005, date, w), "38473 9 8453 2 98")
      .groupByKey()
      // ((45005, date, w), ["38473 9 8453 2 98", "38473 9 8453 2 98", ...])
      .map(it => ((it._1._1, it._1._2), (it._1._3, it._2.iterator.next())))
      // ((45005, date), (w, "495 34957 349857"))
      .groupByKey()
      // ((45005, date), [(i, ...), (j, ...), (k, ...), (w, ...), (d, ...)])
      .filter(it => it._2.size == 5)
      // only groups with exactly five (variable, values) entries remain
      .map(it => (it._1._2, it._2.toMap))
      // (date, Map(variable -> values))
      .sortByKey()
      // sorted by date
      .map(it => (it._1, it._2.get("i"), it._2.get("j"), it._2.get("k"), it._2.get("w"), it._2.get("d")))
      // ordered as i, j, k, w, d
      .map(set => (set._1, set._2.getOrElse("[]"), set._3.getOrElse("[]"), set._4.getOrElse("[]"),
        set._5.getOrElse("[]"), set._6.getOrElse("[]")))
      .map(set => (set._1, parseString(set._2), parseString(set._3), parseString(set._4),
        parseString(set._5), parseString(set._6)))
      .map(set => set._1 + "\t[" + "i=" + set._2 + "," + "j=" + set._3 + "," + "k=" + set._4 + "," +
        "w=" + set._5 + "," + "d=" + set._6 + "]")
      .saveAsTextFile(outputDir)
  }
}
aW1wb3J0IG9yZy5hcGFjaGUuc3BhcmsuU3BhcmtDb250ZXh0CmltcG9ydCBvcmcuYXBhY2hlLnNwYXJrLlNwYXJrQ29uZgppbXBvcnQgU3BhcmtDb250ZXh0Ll8KaW1wb3J0IGphdmEuaW8uRmlsZQppbXBvcnQgamF2YS5uZXQuVVJJCmltcG9ydCBqYXZhLnV0aWwucmVnZXguTWF0Y2hlcjsKaW1wb3J0IGphdmEudXRpbC5yZWdleC5QYXR0ZXJuOwppbXBvcnQgamF2YS51dGlsLlNjYW5uZXI7CgpvYmplY3QgTWFpbiB7CgoJZGVmIG1haW4oYXJnczogQXJyYXlbU3RyaW5nXSkgewoJCXZhbCBhcHBOYW1lID0gIlNwYXJrU3BlYyIKCQl2YWwgamFycyA9IExpc3QoU3BhcmtDb250ZXh0Lmphck9mT2JqZWN0KHRoaXMpLmdldCkKCQlwcmludGxuKGphcnMpCgkJdmFsIGNvbmYgPSBuZXcgU3BhcmtDb25mKCkuc2V0QXBwTmFtZShhcHBOYW1lKS5zZXRKYXJzKGphcnMpCgkJdmFsIHNjID0gbmV3IFNwYXJrQ29udGV4dChjb25mKQoJCWlmIChhcmdzLmxlbmd0aCA8IDIpIHsKCQkJcHJpbnRsbihhcmdzLm1rU3RyaW5nKCIsIikpCgkJCXByaW50bG4oIkVSUk9SLiBQbGVhc2UsIHNwZWNpZnkgaW5wdXQgYW5kIG91dHB1dCBkaXJlY3Rvcmllcy4iKQoJCX0gZWxzZSB7CgkJCXZhbCBpbnB1dERpciA9IGFyZ3MoMCkKCQkJdmFsIG91dHB1dERpciA9IGFyZ3MoMSkKCQkJcHJpbnRsbigiSW5wdXQgZGlyZWN0b3J5OiAiICsgaW5wdXREaXIpCgkJCXByaW50bG4oIk91dHB1dCBkaXJlY3Rvcnk6ICIgKyBvdXRwdXREaXIpCgkJCXJ1bihzYywgaW5wdXREaXIsIG91dHB1dERpcikKCQl9Cgl9CgoJdmFsIEdMVVNURVJGU19NT1VOVF9QQVRIID0gIi9tbnQvcm9vdCIKCXZhbCBGSUxFTkFNRV9QQVRURVJOID0gIihbMC05XXs1fSkoW2Rpamt3XSkoWzAtOV17NH0pXFwudHh0XFwuZ3oiLnIKCXZhbCBMSU5FX1BBVFRFUk4gPSAiKFswLTldezR9IFswLTldezJ9IFswLTldezJ9IFswLTldezJ9IFswLTldezJ9KSguKikiLnIKCXZhbCBWQVJJQUJMRV9OQU1FUyA9IEFycmF5KCJkIiwgImkiLCAiaiIsICJrIiwgInciKQoKCWRlZiB0b0dsdXN0ZXJmc1BhdGgocGF0aDogU3RyaW5nKTogU3RyaW5nID0gcGF0aC5yZXBsYWNlKEdMVVNURVJGU19NT1VOVF9QQVRILCAiIikKCWRlZiB0b0dsdXN0ZXJmc1BhdGgoZmlsZTogRmlsZSk6IFN0cmluZyA9IHRvR2x1c3RlcmZzUGF0aChmaWxlLmdldEFic29sdXRlUGF0aCgpKQoKCWRlZiBtYWtlTGluZXMoIGZpbGVOdW06IFN0cmluZywgczogU3RyaW5nLCBhcnJheTogQXJyYXlbU3RyaW5nXSApID0gewoJCXZhciByZXN1bHQ6IEFycmF5WyhTdHJpbmcsIFN0cmluZywgU3RyaW5nKV0gPSBuZXcgQXJyYXlbKFN0cmluZywgU3RyaW5nLCBTdHJpbmcpXShhcnJheS5sZW5ndGgpOwoJCWZvciAoaSA8LSAwIHRvIGFycmF5Lmxlbmd0aC0xICl7IAoJCQlyZXN1bHQoaSk9IChmaWxlTnVtLCBzLCBhcnJheShpKSApOwoJCX0KCQlyZXN1bHQKCX0KCgoJZGVmIHBhcnNlU3RyaW5nKHM6IFN0cmluZykgPSB7CgkJdmFyIHNjYW5uZXIgOiBT
Y2FubmVyID0gbmV3IFNjYW5uZXIocyk7CgkJdmFyIHN0cmluZyA6IFN0cmluZyA9IG5ldyBTdHJpbmcoIlsiKTsKCQlpZiAoIHNjYW5uZXIuaGFzTmV4dEZsb2F0KCkgKXsKCQkJc3RyaW5nKz1zY2FubmVyLm5leHRGbG9hdCgpOwkJCgkJfQkJCQkKCQl3aGlsZShzY2FubmVyLmhhc05leHRGbG9hdCgpKXsKCQkJc3RyaW5nKz0gIiwgIiArIHNjYW5uZXIubmV4dEZsb2F0KCk7CQkJCQkKCQl9CgkJc3RyaW5nKz0iXSIKCQlzdHJpbmcKCX0KCglkZWYgcnVuKHNjOiBTcGFya0NvbnRleHQsIGlucHV0RGlyOiBTdHJpbmcsIG91dHB1dERpcjogU3RyaW5nKSB7CgkJCgkJc2MKCQkud2hvbGVUZXh0RmlsZXModG9HbHVzdGVyZnNQYXRoKGlucHV0RGlyICsgIi8qLmd6IikpCgkJLy9nbHVzdGVyZnM6L2RhdGFzZXRzL25kYmMtc21hbGwvNDEwMTNpMjAxMi50eHQuZ3ogICAgICAgICAgICB0ZXh0ZnJvbWZpbGUKCQkubWFwKGl0ID0+ICgobmV3IEZpbGUobmV3IEZpbGUoaXQuXzEpLnRvVVJJKCkuZ2V0UGF0aCgpKS5nZXROYW1lKCkpLCBpdC5fMikgICkKCQkvLzQ1MDA1dzIwMTIudHh0Lmd6ICAgICAgICAgIHRleHRmcm9tZmlsZQoJCS5tYXAoIGl0ID0+IChGSUxFTkFNRV9QQVRURVJOLnBhdHRlcm4ubWF0Y2hlcihpdC5fMSksIGl0KSApCQkJCQoJCS5maWx0ZXIoaXQgPT4gaXQuXzEubWF0Y2hlcygpKQoJCS5tYXAoIGl0ID0+ICggKGl0Ll8xLmdyb3VwKDEpICwgaXQuXzEuZ3JvdXAoMikpLCBpdC5fMi5fMikgKQoJCS8vICAgICg0NTAwNSwgdykgdGV4dGZyb21maWxlCgkJLm1hcCggaXQgPT4gKGl0Ll8xLCBpdC5fMi5zcGxpdCgiXFxuIikpICkKCQkvLyAgICAgICg0NTAwNSwgdykgICAgICAgICBsaW5lcwoJCS5mbGF0TWFwKCBpdCA9PiBtYWtlTGluZXMoaXQuXzEuXzEsIGl0Ll8xLl8yLCBpdC5fMikgKQoJCS8vICAgICAgNDUwMDUgICAgICAgdyAgICAgICAgIGxpbmUKCQkubWFwKCBpdCA9PiAoTElORV9QQVRURVJOLnBhdHRlcm4ubWF0Y2hlcihpdC5fMyksIGl0KSApCgkJLy8gICBtYXRjaGVyICggNDUwMDUgdyAgbGluZSApCQoJCS5maWx0ZXIoIGl0ID0+IGl0Ll8xLm1hdGNoZXMoKSApCQoJCS5tYXAoIGl0ID0+ICgoIGl0Ll8yLl8xLCAgaXQuXzEuZ3JvdXAoMSksIGl0Ll8yLl8yICksKCBpdC5fMS5ncm91cCgyKSkpKQoJCS8vICggNDUwMDUsIGRhdGUsIHcpICggIDM4NDczIDkgODQ1MyAyIDk4KSAKCQkuZ3JvdXBCeUtleSgpCgkJCS8vICggNDUwMDUsIGRhdGUgLHcpICBbICgzODQ3MyA5IDg0NTMgMiA5OCkgCgkJCQkvLygzODQ3MyA5IDg0NTMgMiA5OCkoMzg0NzMgOSA4NDUzIDIgOTgpIC4uLgoJCS5tYXAoaXQgPT4gKCAoaXQuXzEuXzEsIGl0Ll8xLl8yKSwgKGl0Ll8xLl8zLCBpdC5fMi5pdGVyYXRvci5uZXh0KCkpKSApCgkJLy8oNDUwMDUsIGRhdGUpKCB3LCA0OTUgMzQ5NTcgMzQ5ODU3ICApCQkKCQkuZ3JvdXBCeUtleSgpCQoJCS8vICggNDUwMDUsIGRhdGUpICBbIChpLCAgMzg0NzMgOSA4NDUzIDIgOTgpIChqLCAgMzg0
NzMgOSA4NDUzIDIgOTgpIChrLCAgMzg0NzMgOSA4NDUzIDIgOTgpIAoJCQkvLyh3LCAgMzg0NzMgOSA4NDUzIDIgOTgpIChkLCAgMzg0NzMgOSA4NDUzIDIgOTgpIF0KCQkuZmlsdGVyKCBpdCA9PiBpdC5fMi5zaXplID09IDUgKQoJCS8vICggNDUwMDUsIGRhdGUpICBbIChpLCAgMzg0NzMgOSA4NDUzIDIgOTgpIChqLCAgMzg0NzMgOSA4NDUzIDIgOTgpIChrLCAgMzg0NzMgOSA4NDUzIDIgOTgpIAoJCQkvLyh3LCAgMzg0NzMgOSA4NDUzIDIgOTgpIChkLCAgMzg0NzMgOSA4NDUzIDIgOTgpIF0KCgkJLm1hcCggaXQgPT4gKGl0Ll8xLl8yLCBpdC5fMi50b01hcCkgKQoJCS8vIGRhdGUgIE1hcDogWyAodywgIDM4NDczIDkgODQ1MyAyIDk4KSAodywgIDM4NDczIDkgODQ1MyAyIDk4KSAodywgIDM4NDczIDkgODQ1MyAyIDk4KSAKCQkJLy8odywgIDM4NDczIDkgODQ1MyAyIDk4KSAodywgIDM4NDczIDkgODQ1MyAyIDk4KSBdCgkJCgkJLnNvcnRCeUtleSgpCgkJLy9zb3J0ZWQgYnkgZGF0ZQoKCQkubWFwKCBpdCA9PiAoaXQuXzEsIGl0Ll8yLmdldCgiaSIpLCBpdC5fMi5nZXQoImoiKSwgaXQuXzIuZ2V0KCJrIiksIGl0Ll8yLmdldCgidyIpLCBpdC5fMi5nZXQoImQiKSApICkKCQkvL9GD0L/QvtGA0Y/QtNC+0YfQtdC90Ysg0L/QviBpamt3ZCcKCQkubWFwKHNldCA9PiAoc2V0Ll8xLCBzZXQuXzIuZ2V0T3JFbHNlKCJbXSIpLCBzZXQuXzMuZ2V0T3JFbHNlKCJbXSIpLCBzZXQuXzQuZ2V0T3JFbHNlKCJbXSIpLAoJCQlzZXQuXzUuZ2V0T3JFbHNlKCJbXSIpLHNldC5fNi5nZXRPckVsc2UoIltdIikpICApCgkJLm1hcCggc2V0ID0+IChzZXQuXzEsIHBhcnNlU3RyaW5nKHNldC5fMiksIHBhcnNlU3RyaW5nKHNldC5fMyksIHBhcnNlU3RyaW5nKHNldC5fNCksIHBhcnNlU3RyaW5nKHNldC5fNSksIHBhcnNlU3RyaW5nKHNldC5fNikpICkKCQkubWFwKCBzZXQgPT4gc2V0Ll8xICsgIlx0WyIgKyAiaT0iICsgc2V0Ll8yICsgIiwiICsgImo9IiArIHNldC5fMyAgKyAiLCIgKyAiaz0iICsgc2V0Ll80ICsgIiwiICsgCgkJCQkJCQkJInc9IiArIHNldC5fNSArICIsIiAgKyAiZD0iICsgc2V0Ll82ICsgIl0iKQoJCQoJCS5zYXZlQXNUZXh0RmlsZShvdXRwdXREaXIpCgkJCgl9Cgp9Cg==