none
Load large nested xml file into sql server tables using Custom XMLTextReader and SQLBulkcopy ( IDataReader). RRS feed

  • Вопрос

  • Hi,
    i have insert a huge xml file (2 GB size) to sql server database. The xml file is nested with child elements. The data should go to 3 different tables. I searxhed on internet and arrived at using XMLTextReader and SQLBulkcopy.

    Since my xml is nested, somebody mentioned to implement a custom IDataReader. But i am not able to find a correct sample code to achieve this.

    How do i read the xml element values to 3 different tables. I was planning to implement 3 sqlbulkcopy implementations

    for 3 different IDataReaders ( 1 for each sub-node types). ie reading the xml 3 times and extract the target table data 3 times.

    My main concern is how do i map the innder nodes of xml file to get the column values for the destination table.

    XML Format :
    ------------------
    <root_elementroot_element>

    <FileMetaData>

    <FileName>root_element_201312020304.XML</FileName>

    <FileID>312020304</FileID>

    <SenderID>571299948</SenderID>

    <SentDate>20131202</SentDate>

    <SentTime>03:04:00</SentTime>

    <ReceiverID></ReceiverID>

    </FileMetaData>

    <Sender>

    <Address>

    <PrimaryStreet>123 street</PrimaryStreet>

    <TaxID>1099948</TaxID>

    <InsurerName></InsurerName>

    <InsurerID></InsurerID>

    <Sponsors>

    <Sponsor>

    <Address>

    <PrimaryStreet>200 1st st</PrimaryStreet>

    </Address>

    <Name>abc Foundation for testResearch</Name>

    <TaxID>4115ss0</TaxID>

    <Contracts>

    <Contract>

    <Code_Se>112478514</Code_Se>

    <Metadata>

    <TransactionType>ADD</TransactionType>

    </Metadata>

    <Member> 

    <Metadata>

    <TransactionType>ADD</TransactionType>

    </Metadata>

    <Address><PrimaryStreet>77 Broadway</PrimaryStreet></Address>

    <FirstName>fnametest</FirstName>

    <LastName>lnametest</LastName>

    <Relationship>SUBSCRIBER</Relationship>

    <PayrollID>300074</PayrollID>

    <rcode>112478514</rcode>

    <Gender>F</Gender>

    <PersonType>S</PersonType>

    <BirthDate>1230101</BirthDate>

    <MaritalStatus>SINGLE</MaritalStatus>

    <Ethnicity>WH</Ethnicity>

    <EffectiveChangeDate>20131018</EffectiveChangeDate>

    <EmploymentStatus>FT</EmploymentStatus>

    <BenefitStatus>ACCEPTED</BenefitStatus>

    <Benefits>

    <Benefit BenefitType='ADD'>

    <TransactionType>AD</TransactionType>

    <CoverageIndicator>true</CoverageIndicator>

    <CoverageAmount>116000</CoverageAmount>

    <OriginalEffectiveDate>20131023</OriginalEffectiveDate>

    <CoverageEffectiveDate>20131023</CoverageEffectiveDate>

    <!--second child table, t_member_benefit_carrieridentifier  starts from here  -->

    <CarrierIdentifiers><CarrierIdentifier>

    <Value>E168</Value>

    <EffectiveDate>20131023</EffectiveDate>

    <Name>PAYROLL_ATTRIBUTE_3</Name>

    </CarrierIdentifier>

    <CarrierIdentifier>

    <Value>45203</Value>

    <EffectiveDate>20131023</EffectiveDate><Name>CONTROL_NBR</Name>

    </CarrierIdentifier><CarrierIdentifier><EffectiveDate>20131023</EffectiveDate>

    <Name>PAYROLL_ATTRIBUTE_2</Name></CarrierIdentifier><CarrierIdentifier>

    <EffectiveDate>20131023</EffectiveDate><Name>PAYROLL_ATTRIBUTE_1</Name>

    </CarrierIdentifier></CarrierIdentifiers>

    <EnrollmentStatus>TIMELY</EnrollmentStatus>

    <SalaryMultiplier>01000000</SalaryMultiplier><Premium>

    <TotalCost>27.84</TotalCost><SponsorCost>27.84</SponsorCost>

    <MemberCost>0.00</MemberCost>

    <TotalBenefitProgramAmount>0.00</TotalBenefitProgramAmount>

    <!-- 3rrd table, t_member_categories nodes starts from here....-->

     - <Categories>
    - <Category>
      <Value>N</Value>
      <EffectiveDate>20131121</EffectiveDate>
      <Name>ASD before 1/1/11</Name>
      </Category>
    - <Category>
      <Value>N</Value>
      <EffectiveDate>20131121</EffectiveDate>
      <Name>Linked Spouse</Name>
      </Category>
      etc etc
      </Categories>

    </Premium></Benefit>
    </member></contract>
    ----------------------
    ------------------------
    etc etx
     </Contracts>
    </Sponsor>
    </Sponsors>
    </Sender>
    </root_element>

    Destination tables
    -------------------------

    1.t_main_member_table
    2.t_member_benefit_carrieridentifier  /* first set of child nodes */
    2.t_member_categories  /* second set of child nodes */

    • Изменено gj08 22 декабря 2013 г. 5:26
    22 декабря 2013 г. 4:58

Ответы

  • Below is how I would do it.  You are going to return 3 different type from the function that reads the XML file.  So I normally create a structure which has a type item and then the data that gets returned.

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Data;
    using System.Data.SqlClient;
    
    namespace ConsoleApplication1
    {
        public enum RecordType
        {
            Type1,
            Type2,
            Type3
    
        }
        class Program
        {
            static SqlConnection conn = new SqlConnection();
            static SqlCommand[] cmds = new SqlCommand[3];
            static void Main(string[] args)
            {
                string SQL1 = "Insert ....   ";
                string SQL2 = "Insert ....   ";
                string SQL3 = "Insert ....   ";
    
                cmds[(int)RecordType.Type1] = new SqlCommand(SQL1, conn);
                cmds[(int)RecordType.Type2] = new SqlCommand(SQL2, conn);
                cmds[(int)RecordType.Type3] = new SqlCommand(SQL3, conn);
    
    
            }
            private static Record WriteSingleRow()
            {
                Record record = ReadRecord();
                
                if (record.record != null)
                {
                    switch (record.type)
                    {
                        case RecordType.Type1:
                            cmds[(int)RecordType.Type1].ExecuteNonQuery();
                            break;
                        case RecordType.Type2:
                            cmds[(int)RecordType.Type2].ExecuteNonQuery();
                            break;
                        case RecordType.Type3:
                            cmds[(int)RecordType.Type3].ExecuteNonQuery();
                            break;
                    }
                }
                return record;
    
            }
            private static Record ReadRecord()
            {
                Record record = new Record();
                //read xml
                return record;
            }
    
        }
        public struct Record
        {
            public RecordType type { get; set; }
            public IDataRecord record { get; set; }
        }
        interface Record1 : IDataRecord
        {
        }
        interface Record2 : IDataRecord
        {
        }
        interface Record3 : IDataRecord
        {
        }
    }
    


    jdweng

    • Помечено в качестве ответа Eason_H 30 декабря 2013 г. 1:41
    24 декабря 2013 г. 9:52

Все ответы

  • I recommended code like on the webpage below a few days ago which read an XML file without putting the entire file into memory.  You could put the code in the website into a custom IDataReader.

    http://support.microsoft.com/kb/307548/en-us


    jdweng

    22 декабря 2013 г. 7:45
  • Hi,

    reading large Xml files can be done using a XmlReader as Joel already suggested. LINQ to Xml also supports a streaming mode based on XmlReader, see How to: Perform Streaming Transform of Large XML Documents. The resulting IEnumerable can be transformed into a IDataReader, for example Converting IEnumerable<T> to IDataReader.

    For the mapping: As I cannot identify primary key candidates in the sample data, an additional Guid may be necessary to get a primary key for each entity. To get an sequential Guid like newsequentialid to prevent fragmentation, see http://stackoverflow.com/questions/211498/is-there-a-net-equalent-to-sql-servers-newsequentialid.

    If the xml data contains duplicate information, e. g. addresses, categories, additional steps may be necessary.

    Regards, Elmar

    22 декабря 2013 г. 9:59
  • Elmar : The sample code I posted uses XML.Linq.  Just didn't want to confuse anybody.

    jdweng

    22 декабря 2013 г. 11:53
  • Still i am not able to get the handle on how to do this. Can someone put a little details steps on how to do it. I am having issue on , how to read the child elements and map into IDadaReader custom class

     static void Main()
           
            {

                //2 GB xml file with nested child nodes ( planning to process xmlbulkcopy 3 times to send the source data into 3 target tables)
                string xmlPath = @"c:\temp\source.xml";

                //step1 : load t_main_member_table
                using (XmlTextReader xmlTextReader = new XmlTextReader(xmlPath))
                {
                    using (SqlBulkCopy sqlBulkCopy = new SqlBulkCopy(connectionString, SqlBulkCopyOptions.TableLock))
                    {
                        SetupBulkCopy(sqlBulkCopy, "t_main_member_table");
                        var reader = new CustomMemberReader(xmlTextReader);
                        sqlBulkCopy.WriteToServer((IDataReader)reader);
                    }
                }

                //step2 : load t_member_benefit

                //step3 : load t_member_benefit_carrieridentifier


            }


            private static void SetupBulkCopy(SqlBulkCopy sqlBulkCopy,string tablename) {
                    sqlBulkCopy.DestinationTableName = tablename;
            }

    /*===================== generic CustomMemberReader   class ================================*/

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Xml;

    namespace TestXML1
    {
        class CustomMemberReader  : XmlDataReader

        {
            private const string XmlTagRow = "Member";
            private const int FieldCount = 30;
            private const int InvalidItemId = -1;


            public CustomMemberReader(XmlReader xmlReader) : base(xmlReader, FieldCount, XmlTagRow) { }

            public override object GetValue(int i)
            {
             //not sure how to put this.....

                switch (i)
                {


                    case 0: return CurrentElement.Element("column_name1").Value;
                    case 1: return CurrentElement.Element("trasnsaction_type").Value;
                    case 2: return CurrentElement.Element("PrimaryStreet").Value;
                    case 3: return CurrentElement.Element("SecondaryStreet").Value;
                    case 4: return CurrentElement.Element("City").Value;
                    case 5: return CurrentElement.Element("State").Value;
                    case 6: return CurrentElement.Element("PostalCode").Value;
                    case 7: return CurrentElement.Element("CountryCode").Value;
                    case 8: return CurrentElement.Element("FirstName").Value;
                    case 9: return CurrentElement.Element("MiddleName").Value;
                    case 10: return CurrentElement.Element("LastName").Value;
                    case 11: return CurrentElement.Element("relationship").Value;
                    case 12: return CurrentElement.Element("PayrollID").Value;
                    case 13: return CurrentElement.Element("filed_name").Value;
                    case 14: return CurrentElement.Element("gender").Value;
                    case 15: return CurrentElement.Element("person_type").Value;
                    case 16: return CurrentElement.Element("BirthDate_char").Value;
                    case 17: return CurrentElement.Element("DeceasedDate_char").Value;
                    case 18: return CurrentElement.Element("marital_status").Value;
                    case 19: return CurrentElement.Element("ethinicity").Value;
                    case 20: return CurrentElement.Element("EffectiveChangeDate_char").Value;
                    case 21: return CurrentElement.Element("EmploymentStatus").Value;
                    case 22: return CurrentElement.Element("BenefitStatus").Value;
                    case 23: return CurrentElement.Element("Occupation").Value;
                    case 24: return CurrentElement.Element("EarningsAmount_char").Value;
                    case 25: return CurrentElement.Element("EarningsClass").Value;
                    //etc etc

                    default:

                        throw new InvalidOperationException("Column count mismatch.");
                }
            }    

        }
    }

    /*======================================================================================================*/

    /* generic xmldatareader class  */
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Xml;
    using System.Xml.Linq;
    using System.Data;

    namespace TestXML1
    {
        public abstract class XmlDataReader : IDataReader
        {

            private readonly string m_rowElementName;
            private readonly XmlReader m_xmlReader;
            private readonly int m_fieldCount = -1;
            private bool m_disposed;
            protected IEnumerator<XElement> m_enumerator;

            public abstract object GetValue(int i);

            /// <summary>    /// Initialize the XmlDataStreamer. After initialization call Read() to move the reader forward.    /// </summary>   

            ///param name="xmlReader">XmlReader used to iterate the data. Will be disposed by when done.</param> 
            /// <param name="fieldCount">IDataReader FiledCount.</param>  
            /// <param name="rowElementName">Name of the XML element that contains row data</param>
            public XmlDataReader(XmlReader xmlReader, int fieldCount, string rowElementName)
            {
                m_rowElementName = rowElementName;
                m_fieldCount = fieldCount;
                m_xmlReader = xmlReader;
                m_enumerator = GetXmlStream().GetEnumerator();
            }

            public bool Read()
            {
                return m_enumerator.MoveNext();
            }

            public int FieldCount
            {
                get { return m_fieldCount; }
            }

            public XElement CurrentElement
            {
                get { return m_enumerator.Current; }
            }
                  
            private IEnumerable<XElement> GetXmlStream()
            {
                XElement rowElement;           
                using (m_xmlReader)
                {
                    m_xmlReader.MoveToContent();
                    while (m_xmlReader.Read())
                    {
                        if (IsRowElement())
                        {
                            rowElement = XElement.ReadFrom(m_xmlReader) as XElement;
                            if (rowElement != null)
                            {
                                yield return rowElement;
                            }
                        }
                    }
                }
            }

            private bool IsRowElement() {
                if (m_xmlReader.NodeType != XmlNodeType.Element)
                    return false;
                return m_xmlReader.Name == m_rowElementName;
            }

           
            protected virtual void Dispose() {
                if (m_disposed) 
                    return;
                m_enumerator.Dispose();
                m_disposed = true;
            }


            #region IDataReader Members

            public void Close()
            {
                throw new NotImplementedException();
            }

            public int Depth
            {
                get { throw new NotImplementedException(); }
            }

            public DataTable GetSchemaTable()
            {
                throw new NotImplementedException();
            }

            public bool IsClosed
            {
                get { throw new NotImplementedException(); }
            }

            public bool NextResult()
            {
                throw new NotImplementedException();
            }

            public int RecordsAffected
            {
                get { throw new NotImplementedException(); }
            }

            #endregion

            #region IDisposable Members

            void IDisposable.Dispose()
            {
                throw new NotImplementedException();
            }

            #endregion

            #region IDataRecord Members


            public bool GetBoolean(int i)
            {
                throw new NotImplementedException();
            }

            public byte GetByte(int i)
            {
                throw new NotImplementedException();
            }

            public long GetBytes(int i, long fieldOffset, byte[] buffer, int bufferoffset, int length)
            {
                throw new NotImplementedException();
            }

            public char GetChar(int i)
            {
                throw new NotImplementedException();
            }

            public long GetChars(int i, long fieldoffset, char[] buffer, int bufferoffset, int length)
            {
                throw new NotImplementedException();
            }

            public IDataReader GetData(int i)
            {
                throw new NotImplementedException();
            }

            public string GetDataTypeName(int i)
            {
                throw new NotImplementedException();
            }

            public DateTime GetDateTime(int i)
            {
                throw new NotImplementedException();
            }

            public decimal GetDecimal(int i)
            {
                throw new NotImplementedException();
            }

            public double GetDouble(int i)
            {
                throw new NotImplementedException();
            }

            public Type GetFieldType(int i)
            {
                throw new NotImplementedException();
            }

            public float GetFloat(int i)
            {
                throw new NotImplementedException();
            }

            public Guid GetGuid(int i)
            {
                throw new NotImplementedException();
            }

            public short GetInt16(int i)
            {
                throw new NotImplementedException();
            }

            public int GetInt32(int i)
            {
                throw new NotImplementedException();
            }

            public long GetInt64(int i)
            {
                throw new NotImplementedException();
            }

            public string GetName(int i)
            {
                throw new NotImplementedException();
            }

            public int GetOrdinal(string name)
            {
                throw new NotImplementedException();
            }

            public string GetString(int i)
            {
                throw new NotImplementedException();
            }

            public int GetValues(object[] values)
            {
                throw new NotImplementedException();
            }

            public bool IsDBNull(int i)
            {
                throw new NotImplementedException();
            }

            public object this[string name]
            {
                get { throw new NotImplementedException(); }
            }

            public object this[int i]
            {
                get { throw new NotImplementedException(); }
            }

            #endregion
        }
    }



    • Изменено gj08 23 декабря 2013 г. 16:16
    23 декабря 2013 г. 16:04
  • The reader is going to fill a SQLCommand.  You have an SQL the looks something like below.  You will setup commands to load the tables.  Then the reader will load the command with data.

           static void Main(string[] args)
            {
                string connStr = "Enter Your Connection String Here"; 
                string SQL = "Insert into Table (a,b,c) Values('@a', '@b','@c')";
    
                SqlConnection conn = new SqlConnection(connStr);
                conn.Open();
    
                SqlCommand cmd = new SqlCommand(SQL, conn);
                cmd.Parameters.Add("@a", System.Data.SqlDbType.VarChar);
                cmd.Parameters.Add("@b", System.Data.SqlDbType.VarChar);
                cmd.Parameters.Add("@c", System.Data.SqlDbType.VarChar);
    
    
                //your reader will do the following
                cmd.Parameters["@a"].Value = "abc";
                cmd.Parameters["@b"].Value = "def";
                cmd.Parameters["@c"].Value = "ghi";
    
            }


    jdweng


    You also need to perform a "cmd.ExecuteNonQuery"
    • Изменено Joel Engineer 23 декабря 2013 г. 17:31
    23 декабря 2013 г. 17:26
  • How to use sqlBulkcopy ?
    23 декабря 2013 г. 17:45
  • The example only show a SELECT and not a INSERT.  It tool me a while to figure this out.

    1) Look at webpage below 

    http://msdn.microsoft.com/en-us/library/system.data.sqlclient.sqldatareader.read(v=vs.110).aspx

          a) The ReadSingleRow is reading one row of the database - Not one row of XML

          b) You will replace with WriteSingleRow

          c) The write will return a IDataRecord.

          d)  Change your IDataReader to a IDataRecord

    I put together the code below using a sample from a book I have.  It is untested.  I'm not sure what advantage there is to a bulk copy so I also included a WriteSingleRow.  A bulk copy would make sense if you XML file was using a stream class. You could then have two threads. One reading the XML file and putting results into the IDataRecord interface and a second thread reading the IDataRecord into the database.

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Data.SqlClient;
    using System.Data;
    
    namespace ConsoleApplication1
    {
        class Program
        {
            static void Main(string[] args)
            {
            string connStr = "Enter Your connection string here";
            ReadOrderData(connStr);
                
            }
    
            static System.Data.IDataReader writemultipleRows = new WritemultipleRows() as System.Data.IDataReader;
            private static void ReadOrderData(string connectionString)
            {
                
                string queryString = "Insert SQL";
    
                using (SqlConnection connection =
                       new SqlConnection(connectionString))
                {
                   SqlCommand command = new SqlCommand(queryString, connection);
                   connection.Open();
    
                   using (SqlBulkCopy bulkCopy =
                              new SqlBulkCopy(connection))
                   {
                       bulkCopy.DestinationTableName =
                           "dbo.BulkCopyDemoMatchingColumns";
    
                       try
                       {
                           // Write from the source to the destination.
                           bulkCopy.WriteToServer(writemultipleRows);
                       }
                       catch (Exception ex)
                       {
                           Console.WriteLine(ex.Message);
                       }
                       finally
                       {
                           // Close the SqlDataReader. The SqlBulkCopy 
                           // object is automatically closed at the end 
                           // of the using block.
    
                       }
                   }
    
    
                   // Call Read before accessing data.
                   IDataRecord record = null;
                   while ((record = WriteSingleRow()) != null)
                   {
                   }
                }
    
            }
            public interface IDataReader : IDisposable, IDataRecord
            {
                int Depth { get; }
                bool IsClosed { get; }
                int RecordsAffected { get; }
                void Close();
                bool NextResult();
                bool Read();
            }
            public class WritemultipleRows : IDataReader
            {
    
                List<IDataRecord> records = new List<IDataRecord>();
    
    
                public List<IDataRecord>.Enumerator GetEnumerator()
                {
                    // Return the array object's IEnumerator.
                    return records.GetEnumerator();
                }
                public bool Read()
                {
                    //read xml file
                    return true;
                }
                public bool NextResult()
                {
                    return true;
                }
                public int RecordsAffected { get; set; }
                public bool IsClosed { get; set; }
                public int Depth { get; set; }
    
                
    
                
                public void Close()
                {
                }
                public void Dispose()
                {
                }
                
            }
            
            private static IDataRecord WriteSingleRow()
            {
                IDataRecord record =  null;
                //add code to read XML
                return record;
    
            }
    
        }
        interface IDataRecord
        {
            //int FieldCount { get; }
            //object this[string name] { get; }
            //object this[int i] { get; }
            //bool GetBoolean(int i);
            //byte GetByte(int i);
            //char GetChar(int i);
            //DateTime GetDateTime(int i);
            //Decimal GetDecimal(int i);
            //float GetFloat(int i);
            //short GetInt16(int i);
            //int GetInt32(int i);
            //long GetInt64(int i);
            //bool IsDBNull(int i);
        }
    
        
    }
    


    jdweng

    23 декабря 2013 г. 23:36
  • What would be appx code for  reading the xml file ? Remember the xml file is nested with child nodes of varying levels and should go to different tables.

    private static IDataRecord WriteSingleRow()
           
    {
               
    IDataRecord record =  null;
               
    //add code to read XML
               
    return record;

           
    }

    24 декабря 2013 г. 6:14
  • Below is how I would do it.  You are going to return 3 different type from the function that reads the XML file.  So I normally create a structure which has a type item and then the data that gets returned.

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Data;
    using System.Data.SqlClient;
    
    namespace ConsoleApplication1
    {
        public enum RecordType
        {
            Type1,
            Type2,
            Type3
    
        }
        class Program
        {
            static SqlConnection conn = new SqlConnection();
            static SqlCommand[] cmds = new SqlCommand[3];
            static void Main(string[] args)
            {
                string SQL1 = "Insert ....   ";
                string SQL2 = "Insert ....   ";
                string SQL3 = "Insert ....   ";
    
                cmds[(int)RecordType.Type1] = new SqlCommand(SQL1, conn);
                cmds[(int)RecordType.Type2] = new SqlCommand(SQL2, conn);
                cmds[(int)RecordType.Type3] = new SqlCommand(SQL3, conn);
    
    
            }
            private static Record WriteSingleRow()
            {
                Record record = ReadRecord();
                
                if (record.record != null)
                {
                    switch (record.type)
                    {
                        case RecordType.Type1:
                            cmds[(int)RecordType.Type1].ExecuteNonQuery();
                            break;
                        case RecordType.Type2:
                            cmds[(int)RecordType.Type2].ExecuteNonQuery();
                            break;
                        case RecordType.Type3:
                            cmds[(int)RecordType.Type3].ExecuteNonQuery();
                            break;
                    }
                }
                return record;
    
            }
            private static Record ReadRecord()
            {
                Record record = new Record();
                //read xml
                return record;
            }
    
        }
        public struct Record
        {
            public RecordType type { get; set; }
            public IDataRecord record { get; set; }
        }
        interface Record1 : IDataRecord
        {
        }
        interface Record2 : IDataRecord
        {
        }
        interface Record3 : IDataRecord
        {
        }
    }
    


    jdweng

    • Помечено в качестве ответа Eason_H 30 декабря 2013 г. 1:41
    24 декабря 2013 г. 9:52