In any case, after significant effort, I managed to write a TsvCompressed output, which seems to do the job. You still need to set the configuration properties of the Hadoop job yourself: set the output compress property to true, and set the codec to something reasonable, otherwise it defaults to the crappy deflate codec.
import com.twitter.scalding._
import cascading.tuple.Fields
import cascading.scheme.local
import cascading.scheme.hadoop.{TextLine, TextDelimited}
import cascading.scheme.Scheme
import org.apache.hadoop.mapred.{OutputCollector, RecordReader, JobConf}

/**
 * Fixed-path source for tab-separated text whose Hadoop scheme has sink
 * compression enabled.
 *
 * All scheme behaviour comes from [[DelimitedSchemeCompressed]].
 *
 * @param p path handed to `FixedPathSource`
 */
case class TsvCompressed(p: String)
  extends FixedPathSource(p)
  with DelimitedSchemeCompressed

/**
 * Mixin providing tab-delimited schemes over `Fields.ALL`; the Hadoop variant
 * has sink compression switched on, the local variant is left unchanged.
 *
 * NOTE(review): enabling compression on the scheme is presumably not enough on
 * its own -- the Hadoop job configuration likely still has to turn output
 * compression on and select a codec. Confirm against the job setup.
 */
trait DelimitedSchemeCompressed extends Source {

  /**
   * Column types passed straight to the `TextDelimited` constructors.
   * Defaults to `null`; override to supply explicit types.
   */
  val types: Array[Class[_]] = null

  /** Tab-delimited scheme used in local mode (no compression is configured here). */
  override def localScheme =
    new local.TextDelimited(Fields.ALL, false, false, "\t", types)

  /**
   * Tab-delimited scheme used on Hadoop, with sink compression enabled.
   */
  override def hdfsScheme = {
    // The two `false` flags are presumably skipHeader / writeHeader -- TODO confirm
    // against the Cascading TextDelimited constructor.
    val compressedScheme = new TextDelimited(Fields.ALL, false, false, "\t", types)

    // Ask the scheme to compress whatever it writes as a sink.
    compressedScheme.setSinkCompression(TextLine.Compress.ENABLE)

    // Cast to the Hadoop-flavoured Scheme type, exactly as the original snippet did.
    compressedScheme
      .asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]]
  }
}
samthebest May 29 '14 at 17:42 2014-05-29 17:42
source share