zoukankan      html  css  js  c++  java
  • mongodb的数据迁移与同步工具 mongoshake

    ##########

    [work@10.10.10.10]$ cat collector.conf
    # if you have any problem, please visit https://github.com/alibaba/MongoShake/wiki/FAQ
    # for the detail explanation, please visit xxxx
    # 如果有问题,请先查看FAQ文档以及wiki上的说明。
    # 关于各个参数的详细说明,请参考:xxx
    
    # current configuration version, do not modify.
    # 当前配置文件的版本号,请不要修改该值。
    conf.version = 2
    
    # --------------------------- global configuration ---------------------------
    # collector name
    # id用于输出pid文件等信息。
    id = mongoshake
    
    # high availability option.
    # enable master election if set true. only one mongoshake can become master
    # and do sync, the others will wait and at most one of them become master once 
    # previous master die. The master information stores in the `mongoshake` db in the source 
    # database by default.
    # This option is useless when there is only one mongoshake running.
    # 如果开启主备mongoshake拉取同一个源端,此参数需要开启。
    master_quorum = false
    
    # http api interface. Users can use this api to monitor mongoshake.
    # `curl 127.0.0.1:9100`.
    # We also provide a restful tool named "mongoshake-stat" to
    # print ack, lsn, checkpoint and qps information based on this api.
    # usage: `./mongoshake-stat --port=9100`
    # 全量和增量的restful监控端口,可以用curl查看内部监控metric统计情况。详见wiki。
    full_sync.http_port = 9101
    incr_sync.http_port = 9100
    # profiling on net/http/profile
    # profiling端口,用于查看内部go堆栈。
    system_profile_port = 9200
    
    # global log level: debug, info, warning, error. lower level message will be filter
    log.level = info
    # log directory. log and pid file will be stored into this file.
    # if not set, default is "./logs/"
    # log和pid文件的目录,如果不设置默认打到当前路径的logs目录。
    log.dir =
    # log file name.
    # log文件名。
    log.file = collector.log
    # log flush enable. If set false, logs may not be print when exit. If
    # set true, performance will be decreased extremely
    # 设置log刷新,false表示包含缓存,如果true那么每条log都会直接刷屏,但对性能有影响;
    # 反之,退出不一定能打印所有的log,调试时建议配置true。
    log.flush = false
    # sync mode: all/full/incr. default is incr.
    # all means full synchronization + incremental synchronization.
    # full means full synchronization only.
    # incr means incremental synchronization only.
    # 同步模式,all表示全量+增量同步,full表示全量同步,incr表示增量同步。
    #sync_mode = incr
    sync_mode = all
    
    # connect source mongodb, set username and password if enable authority. Please note: password shouldn't contain '@'.
    # split by comma(,) if use multiple instance in one replica-set. E.g., mongodb://username1:password1@primaryA,secondaryB,secondaryC
    # split by semicolon(;) if sharding enable. E.g., mongodb://username1:password1@primaryA,secondaryB,secondaryC;mongodb://username2:password2@primaryX,secondaryY,secondaryZ
    # 源MongoDB连接串信息,逗号分隔同一个副本集内的结点,分号分隔分片sharding实例,免密模式
    # 可以忽略“username:password@”,注意,密码里面不能含有'@'符号。
    # 举例:
    # 副本集:mongodb://username1:password1@primaryA,secondaryB,secondaryC
    # 分片集:mongodb://username1:password1@primaryA,secondaryB,secondaryC;mongodb://username2:password2@primaryX,secondaryY,secondaryZ
    mongo_urls = mongodb://igoodful:123456@10.10.10.11:27000,10.10.10.12:27000
    # please fill the source config server url if source mongodb is sharding.
    mongo_cs_url =
    # please give one mongos address if using change stream to fetching data in incremental stage.
    # 如果源端采用change stream拉取,这里还需要配置一个mongos的地址
    mongo_s_url = 
    
    # tunnel pipeline type. now we support rpc,file,kafka,mock,direct
    # 通道模式。
    tunnel = direct
    # tunnel target resource url
    # for rpc. this is remote receiver socket address
    # for tcp. this is remote receiver socket address
    # for file. this is the file path, for instance "data"
    # for kafka. this is the topic and brokers address which split by comma, for
    # instance: topic@brokers1,brokers2, default topic is "mongoshake"
    # for mock. this is uesless
    # for direct. this is target mongodb address which format is the same as `mongo_urls`. If
    # the target is sharding, this should be the mongos address.
    # direct模式用于直接写入MongoDB,其余模式用于一些分析,或者远距离传输场景,
    # 注意,如果是非direct模式,需要通过receiver进行解析,具体参考FAQ文档。
    # 此处配置通道的地址,格式与mongo_urls对齐。
    #tunnel.address = mongodb://user:password@host:port
    tunnel.address = mongodb://igoodful:123456@10.10.10.21:27000
    # the message format in the tunnel, used when tunnel is kafka.
    # "raw": batched raw data format which has good performance but encoded so that users
    # should parse it by receiver.
    # "json": single oplog format by json.
    # "bson": single oplog format by bson.
    # 通道数据的类型,只用于kafka和file通道类型。
    # raw是默认的类型,其采用聚合的模式进行写入和
    # 读取,但是由于携带了一些控制信息,所以需要专门用receiver进行解析。
    # json以json的格式写入kafka,便于用户直接读取。
    # bson以bson二进制的格式写入kafka。
    tunnel.message = raw
    
    # connect mode:
    # primary: fetch data from primary.
    # secondaryPreferred: fetch data from secondary if has, otherwise primary.(default)
    # standalone: fetch data from given 1 node, no matter primary, secondary or hidden. This is only
    # support when tunnel type is direct.
    # 连接模式,primary表示从主上拉取,secondaryPreferred表示优先从secondary拉取(默认建议值),
    # standalone表示从任意单个结点拉取。
    mongo_connect_mode = secondaryPreferred
    
    # filter db or collection namespace. at most one of these two parameters can be given.
    # if the filter.namespace.black is not empty, the given namespace will be
    # filtered while others namespace passed. 
    # if the filter.namespace.white is not empty, the given namespace will be
    # passed while others filtered. 
    # all the namespace will be passed if no condition given.
    # db and collection connected by the dot(.).
    # different namespaces are split by the semicolon(;).
    # filter: filterDbName1.filterCollectionName1;filterDbName2
    # 黑白名单过滤,目前不支持正则,白名单表示通过的namespace,黑名单表示过滤的namespace,
    # 不能同时指定。分号分割不同namespace,每个namespace可以是db,也可以是db.collection。
    #filter.namespace.black =
    filter.namespace.white = mktact.themis_template_field; mktact.themis_template_module;  mktact.goods_allow_list
    # some databases like "admin", "local", "mongoshake", "config", "system.views" are
    # filtered, users can enable these database based on some special needs.
    # different database are split by the semicolon(;).
    # e.g., admin;mongoshake.
    # pay attention: collection isn't support like "admin.xxx" except "system.views"
    # 正常情况下,不建议配置该参数,但对于有些非常特殊的场景,用户可以启用admin,mongoshake等库的同步,
    # 以分号分割,例如:admin;mongoshake。
    filter.pass.special.db = mktact
    # only transfer oplog commands for syncing. represent
    # by oplog.op are "i","d","u".
    # DDL will be transferred if disable like create index, drop databse,
    # transaction in mongodb 4.0.
    # 是否需要开启DDL同步,true表示开启,源是sharding暂时不支持开启。
    # 如果目的端是sharding,暂时不支持applyOps命令,包括事务。
    filter.ddl_enable = true
    
    # checkpoint info, used in resuming from break point.
    # checkpoint存储信息,用于支持断点续传。
    # context.storage.url is used to mark the checkpoint store database. E.g., mongodb://127.0.0.1:20070
    # if not set, checkpoint will be written into source mongodb when source mongodb is replica-set(db=mongoshake),
    # when source mongodb is sharding, the checkpoint will be written into config-server(db=admin)
    # checkpoint的具体写入的MongoDB地址,如果不配置,对于副本集将写入源库(db=mongoshake),对于分片集
    # 将写入config-server(db=admin)
    checkpoint.storage.url =
    # checkpoint db's name.
    # checkpoint存储的db的名字
    checkpoint.storage.db = mongoshake
    # checkpoint collection's name.
    # checkpoint存储的表的名字,如果启动多个mongoshake拉取同一个源可以修改这个表名以防止冲突。
    checkpoint.storage.collection = ckpt_default
    # real checkpoint: the fetching oplog position.
    # pay attention: this is UTC time which is 8 hours latter than CST time. this
    # variable will only be used when checkpoint is not exist.
    # 本次开始拉取的位置,如果checkpoint已经存在(位于上述存储位置)则该参数无效,
    # 如果需要强制该位置开始拉取,需要先删除原来的checkpoint,详见FAQ。
    # 若checkpoint不存在,且该值为1970-01-01T00:00:00Z,则会拉取源端现有的所有oplog。
    # 若checkpoint不存在,且该值不为1970-01-01T00:00:00Z,则会先检查源端oplog最老的时间是否
    # 大于给定的时间,如果是则会直接报错退出。
    checkpoint.start_position = 1970-01-01T00:00:00Z
    
    # transform from source db or collection namespace to dest db or collection namespace.
    # at most one of these two parameters can be given.
    # transform: fromDbName1.fromCollectionName1:toDbName1.toCollectionName1;fromDbName2:toDbName2
    # 转换命名空间,比如a.b同步后变成c.d,谨慎建议开启,比较耗性能。
    #transform.namespace = ucenter.award:award.award;ucenter.award_config:award.award_config;ucenter.award_order:award.award_order;ucenter.address_info:award.address_info
    #transform.namespace = ucenter:award
    #transform.namespace = ucenter.athena_user_task_v2:athena.athena_user_task_v3
    # 数据库名称和集合名称的映射,可能需要变更目的的数据库名称和集合名称 transform.namespace
    = mktact.themis_template_field:bifrost.themis_template_field; mktact.themis_template_module:bifrost.themis_template_module;  mktact.goods_allow_list:bifrost.goods_allow_list # --------------------------- full sync configuration --------------------------- # the number of collection concurrence # 并发最大拉取的表个数,例如,6表示同一时刻shake最多拉取6个表。 full_sync.reader.collection_parallel = 16 # the number of document writer thread in each collection. # 同一个表内并发写的线程数,例如,8表示对于同一个表,将会有8个写线程进行并发写入。 full_sync.reader.write_document_parallel = 32 # number of documents in a batch insert in a document concurrence # 目的端写入的batch大小,例如,128表示一个线程将会一次聚合128个文档然后再写入。 full_sync.reader.document_batch_size = 10240 full_sync.reader.read_document_count=0 # drop the same name of collection in dest mongodb in full synchronization # 同步时如果目的库存在,是否先删除目的库再进行同步。 full_sync.collection_exist_no_drop = true # create foreground indexes when data sync finish in full sync stage. # 全量期间数据同步完毕后,是否需要创建索引,none表示不创建,foreground表示创建前台索引, # background表示创建后台索引。 full_sync.create_index = background # convert insert to update when duplicate key found # 如果_id存在在目的库,是否将insert语句修改为update语句。 full_sync.executor.insert_on_dup_update = true # filter orphan document for source type is sharding. # 源端是sharding,是否需要过滤orphan文档 full_sync.executor.filter.orphan_document = false # enable majority write in full sync. # the performance will degrade if enable. # 全量阶段写入端是否启用majority write full_sync.executor.majority_enable = false # --------------------------- incrmental sync configuration --------------------------- # fetch method: # oplog: fetch oplog from source mongodb (default) # change_stream: use change to receive change event from source mongodb, support MongoDB >= 4.0 incr_sync.mongo_fetch_method = oplog # global id. used in active-active replication. # this parameter is not supported on current open-source version. # gid用于双活防止环形复制,目前只用于阿里云云上MongoDB,如果是阿里云云上实例互相同步 # 希望开启gid,请联系阿里云售后或者烛昭(vinllen),sharding的有多个gid请以分号(;)分隔。 incr_sync.oplog.gids = # distribute data to different worker by hash key to run in parallel. # rker = 8 # [auto] decide by if there has unique index in collections. # use `collection` if has unique index otherwise use `id`. # [id] shard by ObjectId. handle oplogs in sequence by unique _id # [collection] shard by ns. handle oplogs in sequence by unique ns # hash的方式,id表示按文档hash,collection表示按表hash,auto表示自动选择hash类型。 # 如果没有索引建议选择id达到非常高的同步性能,反之请选择collection。 incr_sync.shard_key = auto # oplog transmit worker concurrent # if the source is sharding, worker number must equal to shard numbers. # 内部发送的worker数目,如果机器性能足够,可以提高worker个数。 incr_sync.worker = 128 # batched oplogs have block level checksum value using # crc32 algorithm. and compressor for compressing content # of oplog entry. # supported compressor are : gzip,zlib,deflate # Do not enable this option when tunnel type is "direct" # 是否启用发送,非direct模式发送可以选择压缩以减少网络带宽消耗。 incr_sync.worker.oplog_compressor = none # memory queue configuration, plz visit FAQ document to see more details. # do not modify these variables if the performance and resource usage can # meet your needs. # 内部队列的配置参数,如果目前性能足够不建议修改,详细信息参考FAQ。 incr_sync.worker.batch_queue_size = 64 incr_sync.adaptive.batching_max_size = 1024 incr_sync.fetcher.buffer_capacity = 256 # --- direct tunnel only begin --- # if tunnel type is direct, all the below variable should be set # 下列参数仅用于tunnel为direct的情况。 # oplog changes to Insert while Update found non-exist (_id or unique-index) # 如果_id不存在在目的库,是否将update语句修改为insert语句。 incr_sync.executor.upsert = false # oplog changes to Update while Insert found duplicated key (_id or unique-index) # 如果_id存在在目的库,是否将insert语句修改为update语句。 incr_sync.executor.insert_on_dup_update = false # db. write duplicated logs to mongoshake_conflict # sdk. write duplicated logs to sdk. # 如果写入存在冲突,记录冲突的文档。 incr_sync.conflict_write_to = none # enable majority write in incrmental sync. # the performance will degrade if enable. # 增量阶段写入端是否启用majority write incr_sync.executor.majority_enable = false # --- direct tunnel only end ---

    ###################

     

    ##################

    igoodful@qq.com
  • 相关阅读:
    bzoj2049 [Sdoi2008]Cave 洞穴勘测——LCT
    洛谷P2679 子串——DP
    bzoj3669 [Noi2014]魔法森林——LCT
    洛谷P3778 [APIO2017]商旅——01分数规划
    bzoj4196 [Noi2015]软件包管理器——树链剖分
    bzoj4881 线段游戏——上升序列方案数
    bzoj1426 (洛谷P4550) 收集邮票——期望
    bzoj1858 [Scoi2010]序列操作——线段树
    bzoj3626 [LNOI2014]LCA——树链剖分
    L The Digits String(没有写完,有空补)
  • 原文地址:https://www.cnblogs.com/igoodful/p/15629664.html
Copyright © 2011-2022 走看看