none
The magic number in gzip header is not correct - Exception while reading locally compressed file. RRS feed

  • Question

  • Hi! I tried to read a compressed file locally using a custom extractor, but on the line stream.ReadLine() the code throws the exception "the magic number in gzip header is not correct. make sure you are passing in a gzip stream". What is happening? In a console application with similar code, everything works fine for the same files. What have I done incorrectly?

    Extractor code:

            /// <summary>
            /// Reads FASTQ records from a GZip-compressed input stream and emits one output row per record.
            /// </summary>
            /// <param name="input">Unstructured input; its <c>BaseStream</c> is wrapped in a decompressing <see cref="GZipStream"/>.</param>
            /// <param name="output">Reusable row whose columns are filled by position: id, description, sequence,
            /// optional description, encoded quality and, when quality control is enabled, decoded quality.</param>
            /// <returns>A lazily evaluated sequence of read-only rows.</returns>
            /// <exception cref="InvalidOperationException">
            /// The output schema does not contain <c>_columnCount</c> columns. Because this method is an iterator,
            /// the exception surfaces when enumeration starts, not when the method is called.
            /// </exception>
            public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
            {
                if (output.Schema.Count != _columnCount)
                {
                    throw new InvalidOperationException("Incorrect number of columns");
                }

                // NOTE(review): if the extractor framework has already decompressed the input (reported for
                // file names ending in .gz), this GZipStream wrapper fails with "the magic number in GZip
                // header is not correct" - confirm against the file name used in the EXTRACT statement.
                using (var decompress = new GZipStream(input.BaseStream, CompressionMode.Decompress))
                using (StreamReader sr = new StreamReader(decompress))
                {
                    FastqObjectReader fastqReader = new FastqObjectReader(sr, _computeQualityControl, _qualityEncodingType);
                    FastqObject fastqObject;

                    // The reported exception is thrown inside GetNextSequenceFromStream, on its sr.ReadLine() call.
                    while ((fastqObject = fastqReader.GetNextSequenceFromStream()) != null)
                    {
                        // The numeric id is taken from the second '.'-separated component of the description.
                        String[] descriptionComponents = fastqObject.Description.Split('.');
                        Int32 id = Int32.Parse(descriptionComponents[1], System.Globalization.CultureInfo.InvariantCulture);

                        output.Set<object>(output.Schema[0].Name, id);
                        output.Set<object>(output.Schema[1].Name, fastqObject.Description);
                        output.Set<object>(output.Schema[2].Name, fastqObject.Sequence);
                        output.Set<object>(output.Schema[3].Name, fastqObject.OptionalDescription);
                        output.Set<object>(output.Schema[4].Name, fastqObject.EncodedQuality);
                        if (_computeQualityControl)
                        {
                            output.Set<object>(output.Schema[5].Name, fastqObject.DecodedQuality);
                        }

                        yield return output.AsReadOnly();
                    }
                }
            }

    Code from Console application:

            /// <summary>
            /// Merges two GZip-compressed text files line by line into a single GZip-compressed destination file.
            /// Each pair of lines is appended as "line1|line2|"; after every four pairs the accumulated text is
            /// written as one output line.
            /// </summary>
            /// <remarks>
            /// Reading stops at the first null or empty line of the first source file. A trailing group of fewer
            /// than four line pairs is not written.
            /// </remarks>
            public void Process()
            {
                // Every stream is owned by a using statement so that a failure while opening a later file
                // cannot leak the handles opened before it. FileMode.Create truncates an existing destination,
                // so stale bytes from a longer previous file cannot trail the new GZip data.
                using (FileStream destinationFile = new FileStream(_destination, FileMode.Create, FileAccess.Write))
                using (FileStream sourceFile_1 = new FileStream(_file_1, FileMode.Open, FileAccess.Read))
                using (FileStream sourceFile_2 = new FileStream(_file_2, FileMode.Open, FileAccess.Read))
                using (GZipStream destinationGzip = new GZipStream(destinationFile, CompressionLevel.Optimal))
                using (GZipStream sourceFileGzip_1 = new GZipStream(sourceFile_1, CompressionMode.Decompress))
                using (GZipStream sourceFileGzip_2 = new GZipStream(sourceFile_2, CompressionMode.Decompress))
                using (StreamWriter sw = new StreamWriter(destinationGzip))
                using (StreamReader sr = new StreamReader(sourceFileGzip_1), sr2 = new StreamReader(sourceFileGzip_2))
                {
                    String line_1;
                    int pairsInGroup = 0;
                    StringBuilder builder = new StringBuilder();

                    while (!String.IsNullOrEmpty(line_1 = sr.ReadLine()))
                    {
                        // ReadLine returns null once the second file is exhausted; Append(null) adds nothing.
                        String line_2 = sr2.ReadLine();

                        builder.Append(line_1);
                        builder.Append("|");
                        builder.Append(line_2);
                        builder.Append("|");

                        pairsInGroup++;
                        if (pairsInGroup == 4)
                        {
                            pairsInGroup = 0;
                            sw.WriteLine(builder.ToString());
                            builder.Clear();
                        }
                    }
                }
            }


    Wednesday, April 27, 2016 12:47 PM

All replies

  • How big is your gzip file? And how do you name the file in the EXTRACT statement?

    I see two possible reasons for the error:

    1. Your GZipped file's name ends in .gz. Since our extractor framework does the decompression for you, the data you see inside your extractor is already decompressed.
    2. Your GZipped file's name does not end in .gz. In that case the extractor framework will pass the document through. However, if the document is larger than 250MB and will be split, your code inside the extractor will see only parts of the file and thus raise an error. To fix this, you need to set the extractor property atomicFileProcessing=true.


    Michael Rys

    Saturday, April 30, 2016 12:19 AM
    Moderator
  • Yes, but I am trying to run this code locally, not using a DLA account. And again I must say, as in my other posts, that the AtomicFileProcessing mechanism does not work correctly for my custom extractors: the data is always split, and I still don't know what is happening.

    Sunday, May 1, 2016 9:54 PM
  • This is the code I ran several TBs of data through

    My files ended with gz so I commented out the GZip Stream line.

    If you post your full code, maybe we could spot the problem.

    FYI: The ADL streams can only be read forward.

    using Microsoft.Analytics.Interfaces;
    using Microsoft.Analytics.Types.Sql;
    using System;
    using System.Collections.Generic;
    using System.IO.Compression;
    using System.Linq;
    using System.Text;

    namespace usqldm
    {
        /// <summary>
        /// Extractor for tab-separated text with a header line. Output columns are matched to input
        /// columns by header name, and every value is set as a string.
        /// </summary>
        [SqlUserDefinedExtractor(AtomicFileProcessing = true)]
        public class GZipExtractor : IExtractor
        {
            public GZipExtractor()
            {
            }

            /// <summary>
            /// Reads the header line, then emits one row per remaining line of the input.
            /// </summary>
            /// <param name="input">Unstructured input; its <c>BaseStream</c> is read directly as text.</param>
            /// <param name="output">Reusable row; only columns whose name appears in the header are set.</param>
            /// <returns>A lazily evaluated sequence of read-only rows.</returns>
            public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
            {
                // Decompression is intentionally disabled: the input files end in .gz, so the stream is read as-is.
                // using (var archive = new GZipStream(input.BaseStream, CompressionMode.Decompress))
                using (var sr = new System.IO.StreamReader(input.BaseStream))
                {
                    // Maps each header name to its zero-based column position.
                    Dictionary<string, int> headers = new Dictionary<string, int>();

                    if (!sr.EndOfStream)
                    {
                        string[] headerNames = sr.ReadLine().Split('\t');
                        for (int i = 0; i < headerNames.Length; i++)
                        {
                            // Add throws ArgumentException if the header line repeats a name.
                            headers.Add(headerNames[i], i);
                        }
                    }

                    while (!sr.EndOfStream)
                    {
                        var line = sr.ReadLine();
                        var cols = line.Split('\t');

                        foreach (var c in output.Schema)
                        {
                            int colidx;

                            // NOTE(review): a column missing from the header, or beyond the end of a short line,
                            // is not set for this row; presumably it keeps whatever the reused output row held
                            // before - confirm.
                            if (headers.TryGetValue(c.Name, out colidx) && colidx < cols.Length)
                            {
                                output.Set<object>(c.Name, cols[colidx]);
                            }
                        }

                        yield return output.AsReadOnly();
                    }
                }
            }
        }
    }


    -Brian-

    Tuesday, May 10, 2016 8:54 PM