zoukankan      html  css  js  c++  java
  • .NET实现 数据提取、转换和加载

    早晨看到一篇文章:

    http://www.codeproject.com/Articles/34556/Write-ETL-jobs-in-pure-C

    有两个文件:User name file(Id Name)和 Addresses file(Id Address),内容如下:

    Id Name
    1  Bob

    Id Address
    1 123 Main St.
    2 42 Everywhich way
    如果想得到 Id、Name、Address 合并在一起的结果,在 SQL 中只需一个连接查询就可以得到。
    那么在基于 C# 的文件处理过程中该如何实现呢?原文的代码给出了解决方案。

    首先读取两个文件的内容(FileEngine 基于 FileHelpers):
    /// <summary>
    /// Operation that reads a file as <see cref="UserNameRecord"/> entries
    /// and emits one <see cref="Row"/> per entry.
    /// </summary>
    public class UserNameRead : AbstractOperation
    {
        // Location of the input file; assigned once in the constructor.
        private readonly string sourcePath;

        /// <summary>
        /// Creates the operation for the given input file.
        /// </summary>
        /// <param name="filePath">Path passed to the file engine when the operation executes.</param>
        public UserNameRead(string filePath)
        {
            sourcePath = filePath;
        }

        /// <summary>
        /// Lazily yields a row for every object produced by the file engine.
        /// </summary>
        /// <param name="rows">Not read by this operation.</param>
        /// <returns>One row per object enumerated from the file engine.</returns>
        public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
        {
            FluentFile layout = FluentFile.For<UserNameRecord>();

            // The engine stays open while rows are being pulled and is disposed afterwards.
            using (FileEngine reader = layout.From(sourcePath))
            {
                foreach (object record in reader)
                {
                    yield return Row.FromObject(record);
                }
            }
        }
    }
    
    /// <summary>
    /// Operation that reads a file as <see cref="UserAddressRecord"/> entries
    /// and emits one <see cref="Row"/> per entry.
    /// </summary>
    public class UserAddressRead : AbstractOperation
    {
        // Location of the input file; assigned once in the constructor.
        private readonly string sourcePath;

        /// <summary>
        /// Creates the operation for the given input file.
        /// </summary>
        /// <param name="filePath">Path passed to the file engine when the operation executes.</param>
        public UserAddressRead(string filePath)
        {
            sourcePath = filePath;
        }

        /// <summary>
        /// Lazily yields a row for every object produced by the file engine.
        /// </summary>
        /// <param name="rows">Not read by this operation.</param>
        /// <returns>One row per object enumerated from the file engine.</returns>
        public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
        {
            FluentFile layout = FluentFile.For<UserAddressRecord>();

            // The engine stays open while rows are being pulled and is disposed afterwards.
            using (FileEngine reader = layout.From(sourcePath))
            {
                foreach (object record in reader)
                {
                    yield return Row.FromObject(record);
                }
            }
        }
    }

    建立两个文件之间的关联(按 Id 连接),并构造合并后的新记录:
    /// <summary>
    /// Join operation that pairs left and right rows on "Id" and produces a row
    /// holding the left row's data plus the right row's "Address" value.
    /// </summary>
    public class JoinUserRecords : JoinOperation
    {
        /// <summary>
        /// Declares an inner join keyed on the "Id" column of both sides.
        /// </summary>
        protected override void SetupJoinConditions()
        {
            InnerJoin.Left("Id").Right("Id");
        }

        /// <summary>
        /// Builds the output row for a matched pair of rows.
        /// </summary>
        /// <param name="leftRow">Row whose contents are copied into the result.</param>
        /// <param name="rightRow">Row supplying the "Address" value.</param>
        /// <returns>A new row with the left row's data and the right row's "Address".</returns>
        protected override Row MergeRows(Row leftRow, Row rightRow)
        {
            Row merged = new Row();
            merged.Copy(leftRow);

            // Only "Address" is taken from the right-hand row.
            merged["Address"] = rightRow["Address"];

            return merged;
        }
    }


    合并后的结果如何输出:
    /// <summary>
    /// Operation that writes each incoming row to a file as a <see cref="UserFullRecord"/>
    /// and passes the row through unchanged.
    /// </summary>
    public class UserFullWrite : AbstractOperation
    {
        // Location of the output file; assigned once in the constructor.
        private readonly string targetPath;

        /// <summary>
        /// Creates the operation for the given output file.
        /// </summary>
        /// <param name="filePath">Path passed to the file engine when the operation executes.</param>
        public UserFullWrite(string filePath)
        {
            targetPath = filePath;
        }

        /// <summary>
        /// Writes every incoming row through the file engine, then yields that same row.
        /// </summary>
        /// <param name="rows">Rows to write.</param>
        /// <returns>The input rows, each yielded after it has been written.</returns>
        public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
        {
            FluentFile layout = FluentFile.For<UserFullRecord>();
            layout.HeaderText = "Id\tName\tAddress";

            using (FileEngine writer = layout.To(targetPath))
            {
                foreach (Row current in rows)
                {
                    var record = current.ToObject<UserFullRecord>();
                    writer.Write(record);

                    // Hand the row on so a later operation can still consume it.
                    yield return current;
                }
            }
        }
    }

    调用方法:
    /// <summary>
    /// ETL process that registers the join of the names and addresses files,
    /// followed by the operation that writes the output file.
    /// </summary>
    public class MainProcess : EtlProcess
    {
        /// <summary>
        /// Registers the operations: first the join fed by both file readers, then the writer.
        /// File paths come from <c>Settings.Default</c>.
        /// </summary>
        protected override void Initialize()
        {
            // Build the join step by step: names on the left, addresses on the right.
            var joinWithNames = new JoinUserRecords()
                .Left(new UserNameRead(Settings.Default.NamesFile));
            var fullJoin = joinWithNames
                .Right(new UserAddressRead(Settings.Default.AddressesFile));
            Register(fullJoin);

            var writer = new UserFullWrite(Settings.Default.OutputFile);
            Register(writer);
        }
    }

    总结:结构化的文件通常比较容易处理,而非结构化的文件则不易处理。
    
    
  • 相关阅读:
    my read travel
    OS + CentOS 7 / centos 7 / config / configuration / rescue / rc.local / yum
    my soft / win soft
    如何撰写发明专利申请文件
    专利局审查员如何审专利
    国际专利分类表(2016版)
    手把手教你写专利申请书/如何申请专利
    Packets switched through Linux bridge have very high packet delay variation and latency?
    当心僵尸:过时Linux内核的安全风险
    飞漫魏永明:从MiniGUI看嵌入式十年得与失
  • 原文地址:https://www.cnblogs.com/chenqingwei/p/2808347.html
Copyright © 2011-2022 走看看