none
Processing xml attributes in custom xmlExtractor

    Question

  • I'm trying to parse a complex <g class="gr_ gr_17 gr-alert gr_spell gr_inline_cards gr_run_anim ContextualSpelling ins-del multiReplace" data-gr-id="17" id="17">xml</g> file. 

    <REC r_id_disclaimer="ResearcherID data provided by Thomson Reuters">
    <UID>WOS:A1945UX49300017</UID>
    <static_data>
    <summary>
    <EWUID>
    <WUID coll_id="WOS"/>
    <edition value="WOS.SCI"/>
    </EWUID>
    <pub_info coverdate="1945" has_abstract="N" issue="1" pubtype="Journal" pubyear="1945" sortdate="1945-01-01" vol="4">
    <page begin="6" end="6" page_count="1">6-6</page>
    </pub_info>
    <titles count="5">
    <title type="source">FEDERATION PROCEEDINGS</title>
    <title type="source_abbrev">FED PROC</title>
    <title type="abbrev_11">FED PROC</title>
    <title type="abbrev_29">FED PROC</title>
    <title type="item">INFLUENCE OF O-2 AT HIGH PRESSURE ON MALARIAL PARASITES</title>
    </titles>
    <names count="2">
    <name dais_id="341421" role="author" seq_no="1">
    <display_name>BEAN, JW</display_name>
    <full_name>BEAN, JW</full_name>
    <wos_standard>BEAN, JW</wos_standard>
    <first_name>JW</first_name>
    <last_name>BEAN</last_name>
    </name>
    <name dais_id="1338637" role="author" seq_no="2">
    <display_name>PORTER, RJ</display_name>
    <full_name>PORTER, RJ</full_name>
    <wos_standard>PORTER, RJ</wos_standard>
    <first_name>RJ</first_name>
    <last_name>PORTER</last_name>
    </name>
    </names>
    <doctypes count="1">
    <doctype>Meeting Abstract</doctype>
    </doctypes>
    <publishers>
    <publisher>
    <address_spec addr_no="1">
    <full_address>9650 ROCKVILLE PIKE, BETHESDA, MD 20814-3998</full_address>
    <city>BETHESDA</city>
    </address_spec>
    <names count="1">
    <name addr_no="1" role="publisher" seq_no="1">
    <display_name>FEDERATION AMER SOC EXP BIOL</display_name>
    <full_name>FEDERATION AMER SOC EXP BIOL</full_name>
    </name>
    </names>
    </publisher>
    </publishers>
    </summary>
    <fullrecord_metadata>
    <languages count="1">
    <language type="primary">English</language>
    </languages>
    <normalized_languages count="1">
    <language type="primary">English</language>
    </normalized_languages>
    <normalized_doctypes count="1">
    <doctype>Abstract</doctype>
    </normalized_doctypes>
    <references count="0"/>
    <addresses count="0"/>
    <category_info>
    <headings count="1">
    <heading>Science &amp; Technology</heading>
    </headings>
    <subheadings count="1">
    <subheading>Life Sciences &amp; Biomedicine</subheading>
    </subheadings>
    <subjects count="2">
    <subject ascatype="traditional">Biology</subject>
    <subject ascatype="extended">Life Sciences &amp; Biomedicine - Other Topics</subject>
    </subjects>
    </category_info>
    </fullrecord_metadata>
    <item xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="itemType_wos">
    <ids avail="Y">UX493</ids>
    <bib_id>4 (1): 6-6 1945</bib_id>
    </item>
    </static_data>
    <dynamic_data>
    <cluster_related>
    <identifiers>
    <identifier type="accession_no" value="UX493"/>
    <identifier type="issn" value="0014-9446"/>
    </identifiers>
    </cluster_related>
    </dynamic_data>
    </REC>

    I'm using the code from https://github.com/Azure/usql/tree/master/Examples/DataFormats/Microsoft.Analytics.Samples.Formats/Xml. But I'm unable to figure out how to process attributes. For example, I need to get the value of <pub_info>pubtype: and <language type="primary">English</language>. 

    Below is my code. 

            public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
            {
                
                // Make sure that all requested columns are of type string
                IColumn column = output.Schema.FirstOrDefault(col => col.Type != typeof(string));
                if (column != null)
                {
                    throw new ArgumentException(string.Format("Column '{0}' must be of type 'string', not '{1}'", column.Name, column.Type.Name));
                }
                // Microsoft.Analytics.Diagnostics.DiagnosticStream.WriteLine(System.String.Format(column.Name + ":" + column.Type.Name));
    
                var state = new ParseState();
                state.ClearAndJump(ParseLocation.Row);
                using (var reader = XmlReader.Create(input.BaseStream))
                {
                    while (reader.Read())
                    {
                        String xmlNodeName = reader.Name;
                        Microsoft.Analytics.Diagnostics.DiagnosticStream.WriteLine(System.String.Format("xmlNodeType" + ":" + (int)reader.NodeType));
                        switch (state.Location)
                        {
                            case ParseLocation.Row:
                                // when looking for a new row, we are only interested in elements
                                // whose name matches the requested row element
                                if (reader.NodeType == XmlNodeType.Element && xmlNodeName == this.rowPath)
                                {
                                    // when found, clear the IUpdatableRow's memory
                                    // (this is no provided Clear method)
                                    for (int i = 0; i < output.Schema.Count; i++)
                                    {
                                        output.Set<string>(i, null);
                                    }
    
                                    state.ClearAndJump(ParseLocation.Column);
                                }
    
                                break;
                            case ParseLocation.Column:
                                // When looking for a new column, we are interested in elements
                                // whose name is a key in the columnPaths map or
                                // whose name is in the requested output schema.
                                // This indicates a column whose value needs to be read, 
                                // so prepare for reading it by clearing elementValue.
                                if (reader.NodeType == XmlNodeType.Element
                                    && (this.columnPaths.ContainsKey(xmlNodeName)
                                        || output.Schema.Select(c => c.Name).Contains(xmlNodeName)))
                                {
                                    if (reader.IsEmptyElement)
                                    {
                                        // For an empty element, set an empty string 
                                        // and immediately jump to looking for the next column
                                        output.Set(this.columnPaths[xmlNodeName] ?? xmlNodeName, state.ReadElementValue());
                                        state.ClearAndJump(ParseLocation.Column);
                                    }
                                    else
                                    {
                                        state.Location = ParseLocation.Data;
                                        state.ElementName = xmlNodeName;
                                        state.ClearElementValue();
                                    }
                                }
                                else if (reader.NodeType == XmlNodeType.EndElement && xmlNodeName == this.rowPath)
                                {
                                    // The other interesting case is an end element whose name matches 
                                    // the current row element. This indicates the end of a row, 
                                    // so yield the now-complete row and jump to looking for 
                                    // another row.
                                    yield return output.AsReadOnly();
                                    state.ClearAndJump(ParseLocation.Row);
                                }
    
                                break;
                            case ParseLocation.Data:
                                // Most of the code for reading the value of a column
                                // deals with re-creating the inner XML from discrete elements.
                                // The only jump occurs when the reader hits an end element
                                // whose name matches the current column. In this case, we
                                // need to write the accumulated value to the appropriate 
                                // column in the output row.
                                
                                switch (reader.NodeType)
                                {
                                    case XmlNodeType.EndElement:
                                        if (xmlNodeName == state.ElementName)
                                        {
                                            output.Set(this.columnPaths[state.ElementName] ?? state.ElementName, state.ReadElementValue());
                                            state.ClearAndJump(ParseLocation.Column);
                                        }
                                        else
                                        {
                                            state.ElementWriter.WriteEndElement();
                                        }
    
                                        break;
                                    case XmlNodeType.Element:
                                        state.ElementWriter.WriteStartElement(xmlNodeName);
                                        
                
                                        state.ElementWriter.WriteAttributes(reader, false);
                                        if (reader.IsEmptyElement)
                                        {
                                            state.ElementWriter.WriteEndElement();
                                        }
    
                                        break;
                                    case XmlNodeType.CDATA:
                                        state.ElementWriter.WriteCData(reader.Value);
                                        break;
                                    case XmlNodeType.Comment:
                                        state.ElementWriter.WriteComment(reader.Value);
                                        break;
                                    case XmlNodeType.ProcessingInstruction:
                                        state.ElementWriter.WriteProcessingInstruction(reader.Name, reader.Value);
                                        break;
                                    case XmlNodeType.Attribute:
                                        Microsoft.Analytics.Diagnostics.DiagnosticStream.WriteLine(System.String.Format("xmlElementType : " + xmlNodeName));
                                        if (xmlNodeName.ToLower().Contains("::"))
                                        {
                                            var splitted = Regex.Split(xmlNodeName, "::");
                                            String attributeName = splitted[splitted.Length - 1];
                                            Microsoft.Analytics.Diagnostics.DiagnosticStream.WriteLine(System.String.Format("attributeName : " + attributeName));
                                            if (null != reader.GetAttribute(attributeName))
                                            {
                                                state.ElementWriter.WriteString(reader.Value);
                                            }
                                        }
                                        break;
                                    default:
                                        //Microsoft.Analytics.Diagnostics.DiagnosticStream.WriteLine(System.String.Format("xmlElementType : " + reader.Value));
                                        state.ElementWriter.WriteString(reader.Value);
                                        break;
                                }
    
                                break;
                            default:
                                throw new NotImplementedException("StreamFromXml has not implemented a new member of the ParseLocation enum");
                        }
                    }
    
                    if (state.Location != ParseLocation.Row)
                    {
                        throw new ArgumentException("XML document ended without proper closing tags");
                    }
                }
            }

    My <g class="gr_ gr_287 gr-alert gr_spell gr_inline_cards gr_run_anim ContextualSpelling ins-del multiReplace" data-gr-id="287" id="287">usql</g> code is below. 

    SET @@FeaturePreviews = "DIAGNOSTICS:ON";
    
    USE master; 
    REFERENCE ASSEMBLY [WOSExtractor];
    
    // copy to ADL 
    
    @wiki = 
    	EXTRACT city string, 
    	pub_info  string,
    	name string,
        doctype string     
    	FROM "/WoS/XML/1945/WR_1945_20140602130813_CORE_0001.xml" 
    	USING new WOSExtractor.WOSXMLExtractor("REC",
    		new SQL.MAP<string,string> { 
    			 {"city","city" },
    			 {"pub_info::pubtype","pub_info::pubtype" },
    			 {"name::display_name","name::display_name"},
    			 {"doctype", "doctype"}
                }
    	);
     
    @wos = SELECT w.doctype AS doctype,
                  w.city AS city,
                  w.pub_info AS pubInfo,
                  w.name AS name
            FROM @wiki AS w; 
    		
    
    OUTPUT @wos TO "output/parsedData/foo1.txt" USING Outputters.Csv();

    Appreciate your help on this. 

    Friday, September 21, 2018 7:36 PM

All replies

  • Hi Chathuri,

        Thanks for providing the scripts and letting us know what you tried but what I am missing to get his "What error are you facing"?


    Wednesday, September 26, 2018 5:41 PM
    Moderator
  • My output is like this. 

    Letter
    LONDON
    <page page_count="1">287-287</page>
    <display_name>LANCET LTD</display_name> <full_name>LANCET LTD</full_name>
    Article
    CAMBRIDGE
    <page page_count="1">211-211</page>
    <display_name>ROYAL SOC CHEMISTRY</display_name> <full_name>ROYAL SOC CHEMISTRY</full_name>
    Letter
    CHICAGO
    <page page_count="2">186-187</page>
    <display_name>UNIV CHICAGO PRESS</display_name> <full_name>UNIV CHICAGO PRESS</full_name>
    Article
    PHILADELPHIA
    <page page_count="7">998-1004</page>
    <display_name>W B SAUNDERS CO</display_name> <full_name>W B SAUNDERS CO</full_name>

    For the 3rd and 4th column, it prints the whole <g class="gr_ gr_67 gr-alert gr_spell gr_inline_cards gr_run_anim ContextualSpelling ins-del multiReplace" data-gr-id="67" id="67">xml</g> node, instead of the value of the attribute. It does not give any error. Everything runs successfully. 

    Wednesday, September 26, 2018 5:47 PM