C# .NET 使用 Open XML SDK 提取 PPT 中的音频、视频、图片、文本
名称空间:
using System; using DocumentFormat.OpenXml.Packaging; using System.IO; using System.Linq; using DocumentFormat.OpenXml; using DocumentFormat.OpenXml.Presentation; using A = DocumentFormat.OpenXml.Drawing; using P14 = DocumentFormat.OpenXml.Office2010.PowerPoint;
代码如下:
/// <summary>
/// Opens a presentation read-only and, slide by slide, locates its audio shapes,
/// plain pictures and text bodies.
/// </summary>
/// <remarks>
/// This is a walkthrough: the values found for each shape (id, name, description,
/// part streams, text) are held in locals and are neither returned nor stored.
/// Consume them at the places marked in the helpers.
/// </remarks>
/// <param name="path">Path of the presentation file to open.</param>
public void PptInfo(string path)
{
    using (var doc = PresentationDocument.Open(path, false))
    {
        var presentation = doc.PresentationPart?.Presentation;
        var slideIdList = presentation?.SlideIdList;
        if (slideIdList == null)
        {
            // No slide list: nothing to extract.
            return;
        }

        // Number assigned to the first slide; 1 when the attribute is absent.
        var firstSlideNumber = presentation.FirstSlideNum?.Value ?? 1;
        var slideIndex = -1;

        foreach (var slideId in slideIdList.Elements<SlideId>())
        {
            // Count every slide id, including ones skipped below, so that the
            // displayed number stays aligned with the slide's position.
            slideIndex++;

            var relationshipId = slideId.RelationshipId?.Value;
            if (relationshipId == null)
            {
                continue;
            }

            var slidePart = doc.PresentationPart.GetPartById(relationshipId) as SlidePart;
            if (slidePart == null || slidePart.Slide == null)
            {
                continue;
            }

            // Slide number as displayed. The coalesce is evaluated first on purpose:
            // "a ?? 1 + i" would parse as "a ?? (1 + i)" and drop the index
            // whenever the first-slide number is set.
            var slideNumber = firstSlideNumber + slideIndex;

            var slide = slidePart.Slide;

            // Audio timing nodes.
            var audioList = slide.Descendants<Audio>();
            // Video timing nodes. To extract video, iterate this list instead and
            // look up A.VideoFromFile rather than A.AudioFromFile in the helper.
            var videoList = slide.Descendants<Video>();

            foreach (var media in audioList)
            {
                ReadAudioShape(slidePart, media);
            }

            var shapeTree = slide.CommonSlideData?.ShapeTree;
            if (shapeTree == null)
            {
                continue;
            }

            // Pictures whose application non-visual properties have no children.
            // Media shapes carry an audio/video file element there, so this keeps
            // them out of the picture list.
            var picList = shapeTree.Descendants<Picture>()
                .Where(o => o.NonVisualPictureProperties?.ApplicationNonVisualDrawingProperties?.Any() != true);
            foreach (var pic in picList)
            {
                ReadPictureShape(slidePart, pic);
            }

            // Text bodies.
            foreach (var txBody in shapeTree.Descendants<TextBody>())
            {
                ReadTextShape(txBody);
            }
        }
    }
}

/// <summary>
/// Resolves the picture shape an audio timing node targets, then its media file
/// (external link or embedded data part) and its image part.
/// </summary>
private static void ReadAudioShape(SlidePart slidePart, Audio media)
{
    // Shape the audio node points at; other target kinds leave ShapeTarget null.
    var spTgt = media.CommonMediaNode?.TargetElement?.ShapeTarget;
    if (spTgt?.ShapeId == null)
    {
        return;
    }

    // Compare the attribute text on both sides so the match does not depend on
    // the two attributes being exposed as the same value type.
    var targetId = spTgt.ShapeId.InnerText;
    var cNvPr = slidePart.Slide.Descendants<NonVisualDrawingProperties>()
        .FirstOrDefault(o => o.Id != null && o.Id.InnerText == targetId);
    if (cNvPr == null)
    {
        return;
    }

    // Shape information.
    var shapeId = cNvPr.Id.Value;
    var shapeName = cNvPr.Name?.Value;
    var shapeDescr = cNvPr.Description?.Value;

    // Parent and grandparent; skip targets that are not picture shapes.
    var nvPicPr = cNvPr.Parent as NonVisualPictureProperties;
    var pic = nvPicPr?.Parent as Picture;
    if (pic == null)
    {
        return;
    }

    // Audio file reference stored in the application non-visual properties.
    var audioFile = nvPicPr.ApplicationNonVisualDrawingProperties?
        .Elements<A.AudioFromFile>()
        .FirstOrDefault();
    var link = audioFile?.Link?.Value;
    if (link != null)
    {
        // External relationship first.
        var uri = slidePart.ExternalRelationships.FirstOrDefault(o => o.Id == link)?.Uri;
        if (uri == null)
        {
            // Otherwise an internal data part reference.
            var dataRelationship = slidePart.DataPartReferenceRelationships.FirstOrDefault(o => o.Id == link);
            if (dataRelationship != null)
            {
                uri = dataRelationship.Uri;
                using (var mediaStream = dataRelationship.DataPart.GetStream())
                {
                    // Consume the media bytes here; the stream is closed on exit.
                }
            }
        }
    }

    // Image associated with the media shape (absent when the blip has no embed id).
    var embed = pic.BlipFill?.Blip?.Embed?.Value;
    if (embed != null)
    {
        var part = slidePart.GetPartById(embed);
        using (var imgStream = part.GetStream())
        {
            // Consume the image bytes here; the stream is closed on exit.
        }
    }
}

/// <summary>
/// Reads a picture shape's identity and opens the image part it embeds.
/// </summary>
private static void ReadPictureShape(SlidePart slidePart, Picture pic)
{
    var cNvPr = pic.NonVisualPictureProperties?.NonVisualDrawingProperties;
    if (cNvPr == null)
    {
        return;
    }

    // Shape information.
    var shapeId = cNvPr.Id?.Value;
    var shapeName = cNvPr.Name?.Value;
    var shapeDescr = cNvPr.Description?.Value;

    // A picture without an embed id has no image part inside the package.
    var embed = pic.BlipFill?.Blip?.Embed?.Value;
    if (embed == null)
    {
        return;
    }

    var part = slidePart.GetPartById(embed);
    using (var imgStream = part.GetStream())
    {
        // Consume the image bytes here; the stream is closed on exit.
    }
}

/// <summary>
/// Reads the owning shape's identity and the text of a text body in three forms.
/// </summary>
private static void ReadTextShape(TextBody txBody)
{
    // Owning shape; skip text bodies whose parent is not a Shape.
    var sp = txBody.Parent as Shape;
    var cNvPr = sp?.NonVisualShapeProperties?.NonVisualDrawingProperties;
    if (cNvPr == null)
    {
        return;
    }

    // Shape information.
    var shapeId = cNvPr.Id?.Value;
    var shapeName = cNvPr.Name?.Value;

    // Option 1: inner text of the whole body.
    var innerText = txBody.InnerText;

    // Option 2: concatenate the individual text elements.
    var runText = string.Concat(txBody.Descendants<A.Text>().Select(o => o.Text));

    // Option 3: one line per paragraph.
    var paragraphText = string.Join(
        Environment.NewLine,
        txBody.Descendants<A.Paragraph>().Select(o => o.InnerText));
}
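PPT 文档的形状结构大致如下(原文此处的示意图缺失,以下按上文代码访问到的元素整理):
p:sld → p:cSld → p:spTree
  p:pic → p:nvPicPr(p:cNvPr、p:nvPr[a:audioFile / a:videoFile])、p:blipFill → a:blip(r:embed)
  p:sp → p:nvSpPr(p:cNvPr)、p:txBody → a:p → a:r → a:t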
完毕