none
Using System.IO.Packaging to Extract the document.xml from a Word 2007 document RSS feed

  • Question

  • Would someone be kind enough to point me to a good example of extracting the document.xml from a Word 2007 document using System.IO.Packaging and the ZipPackage class?

    I have the following code:

    // Relationship type string used below to look up the package's main
    // document part (document.xml).
    const string documentRelationshipType =
                            "http://schemas.openxmlformats.org/officeDocument/" +
                            "2006/relationships/officeDocument";
    
                        // Full path of the .docx file = folder + file name.
                        string filePath = @"F:\projects\visualStudio\WPF\WordDriver\WordDriver\Data\";
                        string fileName = @"Test.docx";
                        string docName = filePath + fileName;
                        // Receives the main document part once the loop below finds it.
                        PackagePart documentPart = null;
    
                        // Open the existing file as a package with read/write access; the
                        // using statement disposes the package when this block exits.
                        using (Package wdPackage = Package.Open(
                                docName, FileMode.Open, FileAccess.ReadWrite))
                        {
                            //  Get the main document part (document.xml).
                            foreach (PackageRelationship relationship in
                              wdPackage.GetRelationshipsByType(documentRelationshipType))
                            {
                                // Resolve the relationship's target URI against the package
                                // root ("/") to obtain the URI of the part, then fetch the part.
                                Uri documentUri = PackUriHelper.ResolvePartUri(
                                  new Uri("/", UriKind.Relative), relationship.TargetUri);
                                documentPart = wdPackage.GetPart(documentUri);
                                //  There is only one document.
                                break;
                            }
                            // NOTE(review): documentPart is only assigned here; the package that
                            // owns it is disposed at the closing brace below, so any reading of
                            // the part presumably has to happen inside this block — confirm.
                        }
    I just need to see some sample code for manipulating the documentPart once it finds the document.xml part.

    Or, if I am completely off base, could someone point me in the right direction?

    Thanks,

    Rob
    Sunday, June 21, 2009 3:16 PM

All replies

  • Here are 2 links which can help:

    How to: Manipulate Office Open XML Formats Documents
    http://msdn.microsoft.com/en-us/library/aa982683.aspx

    If you are using Open XML SDK, here is the link:

    http://msdn.microsoft.com/en-us/library/bb656295.aspx

    How to: Get the Contents of a Document Part from an Office Open XML Package by Using the Open XML API
    http://msdn.microsoft.com/en-us/library/bb497448.aspx

    How Do I
    http://msdn.microsoft.com/en-us/library/bb491088.aspx

    HTH,
    Ankush
    Monday, June 22, 2009 9:21 PM
  • Thanks for those links, they helped.  I am still getting some confusing behavior.  I have a Test.docx file which was loaded with the =lorem() method.

    I manually extracted the document.xml and created two files from it: one with the entire document.xml, and the other with just the w:body part.

    I can successfully get the "w:p" nodes using XmlDocument (doc1), but cannot successfully use XDocument loading from an XML string or an XmlReader loaded from a Stream (documentPart.GetStream()).  But I can successfully get the nodes if I load from a file.

    I want to get the documentPart and then from the GetStream() method load an XDocument and traverse the w:p nodes. 

    I bet I am missing something rather simple ;)

    // Experiment: locate the main document part of a .docx package, then load its
    // XML four different ways (XmlDocument, XDocument from a string, XDocument from
    // an XmlReader, XDocument from a file) and query each for w:p elements.
    // Relationship type string used below to look up the main document part.
    const string documentRelationshipType = 
                            "http://schemas.openxmlformats.org/officeDocument/" +
                            "2006/relationships/officeDocument";
                        // Namespace URI that is bound to the "w" prefix further down.
                        const String dcPropertiesSchema =
                            "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
    
                        string filePath = @"F:\projects\visualStudio\WPF\WordDriver\WordDriver\Data\";
                        string fileName1 = @"Test.docx";
                        string fileName2 = @"body.xml";
                        string fileName3 = @"document.xml";
                        string docName = filePath + fileName1;
    
                        string xmlFile = filePath + fileName3; // use fileName2 or fileName3
                        
                        PackagePart documentPart = null;
    
                        // Open the existing .docx as a package with read/write access; the
                        // using statement disposes it when the block exits.
                        using (Package wdPackage = Package.Open(
                                docName, FileMode.Open, FileAccess.ReadWrite))
                        {
                            //  Get the main document part (document.xml).
                            foreach (PackageRelationship relationship in
                              wdPackage.GetRelationshipsByType(documentRelationshipType))
                            {
                                // Resolve the relationship target against the package root
                                // ("/") and fetch the corresponding part.
                                Uri documentUri = PackUriHelper.ResolvePartUri(
                                  new Uri("/", UriKind.Relative), relationship.TargetUri);
                                documentPart = wdPackage.GetPart(documentUri);
    
                                // Namespace manager so the XPath query below can use the
                                // "w:" prefix.
                                NameTable nt = new NameTable();
                                XmlNamespaceManager nsmgr = new XmlNamespaceManager(nt);
                                nsmgr.AddNamespace("w", dcPropertiesSchema);
    
                                // This Works
                                // The XPath spells out the full path to the paragraphs:
                                // w:document -> w:body -> w:p.
                                XmlDocument doc1 = new XmlDocument(nt);
                                doc1.Load(documentPart.GetStream());
                                XmlNodeList ndlst = doc1.SelectNodes("/w:document/w:body/w:p", nsmgr);
                               
                                // Serialized XML of doc1, re-parsed into doc2 below.
                                string xmlDocString = doc1.InnerXml;
                           
                                // Same namespace URI as dcPropertiesSchema, as an XNamespace
                                // for the LINQ to XML queries.
                                XNamespace w =
                                    "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
    
                                
                                // Load from string created from XmlDocument created from doc1
                                XDocument doc2 = XDocument.Parse(xmlDocString);
                                XElement root2 = doc2.Root;
    
                                // Returns Empty
                                // NOTE(review): the query asks the root element for w:p, but the
                                // working XPath above finds w:p under /w:document/w:body.
                                // Elements() presumably yields direct children only, so nothing
                                // matches at the root; querying the w:body child (or using
                                // Descendants(w + "p")) looks like the fix — confirm.
                                IEnumerable<XElement> xe2 =
                                    from el2 in root2.Elements(w + "p")
                                    select el2;
    
                                foreach (XElement e2 in xe2)
                                    Console.WriteLine(e2.HasElements);
    
                                // Load from XmlReader loaded from Stream
                                XmlReader reader = XmlReader.Create(documentPart.GetStream());
                                //reader.MoveToContent();
                              
                                // Yet this loads the xml
                                XDocument doc3 = XDocument.Load(reader);
                                XElement root3 = doc3.Root;
    
                                // Returns empty
                                // NOTE(review): same root-level query as for doc2, so presumably
                                // empty for the same reason (w:p sits under w:body) — confirm.
                                IEnumerable<XElement> xe3 =
                                    from el3 in root3.Elements(w + "p")
                                    select el3;
    
                                foreach (XElement e3 in xe3)
                                    Console.WriteLine(e3.HasElements);
                            
    
                                // Load from File
                                XDocument doc4 = XDocument.Load(xmlFile);
                                XElement root4 = doc4.Root;
    
                                // Returns With XElements
                                // NOTE(review): xmlFile is set to document.xml above; this query
                                // presumably returns elements only when xmlFile is switched to
                                // body.xml, whose root would be w:body — confirm.
                                IEnumerable<XElement> xe4 =
                                    from el4 in root4.Elements(w + "p")
                                    select el4;
    
                                foreach (XElement e4 in xe4)
                                    Console.WriteLine(e4.HasElements);
    
    Tuesday, June 23, 2009 2:40 PM