AWS DMS Tasks fails after few days stating storage used by swap files exceeded the limit for task

1k Views Asked by At

I am running 8-10 ongoing replication tasks in AWS DMS, with multiple tables inside each task. But one task in specific gets fails ~twice per week. It has a highly transactional table in which ~1 mil updates happen per 10 mins.

Error-

Reading from source endpoint temporary paused as total storage used by swap files exceeded the limit for task: abc-xyz

Info in Cloudwatch -

Reading from source is paused. Total storage used by swap files exceeded the limit 1048576000 bytes

All premigration assessments passed.

All the tables have a primary key and no LOB or CLOB cols in the task which gets failed. Details of the task:

  • Source -Oracle
  • Target - Redshift
  • Replication instance - dms.c5.2xlarge, single AZ
  • Avg CPU utilization - ~35 %
  • Using LOG miner technique.

Task Setting JSON:

{
    "Logging": {
        "EnableLogging": true,
        "LogComponents": [
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "TRANSFORMATION"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "SOURCE_UNLOAD"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "IO"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "TARGET_LOAD"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "PERFORMANCE"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "SOURCE_CAPTURE"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "SORTER"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "REST_SERVER"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "VALIDATOR_EXT"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "TARGET_APPLY"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "TASK_MANAGER"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "TABLES_MANAGER"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "METADATA_MANAGER"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "FILE_FACTORY"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "COMMON"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "ADDONS"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "DATA_STRUCTURE"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "COMMUNICATION"
            },
            {
                "Severity": "LOGGER_SEVERITY_DEFAULT",
                "Id": "FILE_TRANSFER"
            }
        ],
        "CloudWatchLogGroup": "dms-tasks-rep-instance-poc-3",
        "CloudWatchLogStream": "dms-task-abc-ongoing-task"
    },
    "StreamBufferSettings": {
        "StreamBufferCount": 3,
        "CtrlStreamBufferSizeInMB": 5,
        "StreamBufferSizeInMB": 8
    },
    "ErrorBehavior": {
        "FailOnNoTablesCaptured": false,
        "ApplyErrorUpdatePolicy": "LOG_ERROR",
        "FailOnTransactionConsistencyBreached": false,
        "RecoverableErrorThrottlingMax": 1800,
        "DataErrorEscalationPolicy": "SUSPEND_TABLE",
        "ApplyErrorEscalationCount": 0,
        "RecoverableErrorStopRetryAfterThrottlingMax": false,
        "RecoverableErrorThrottling": true,
        "ApplyErrorFailOnTruncationDdl": false,
        "DataTruncationErrorPolicy": "LOG_ERROR",
        "ApplyErrorInsertPolicy": "LOG_ERROR",
        "ApplyErrorEscalationPolicy": "LOG_ERROR",
        "RecoverableErrorCount": -1,
        "DataErrorEscalationCount": 0,
        "TableErrorEscalationPolicy": "STOP_TASK",
        "RecoverableErrorInterval": 5,
        "ApplyErrorDeletePolicy": "IGNORE_RECORD",
        "TableErrorEscalationCount": 0,
        "FullLoadIgnoreConflicts": true,
        "DataErrorPolicy": "LOG_ERROR",
        "TableErrorPolicy": "SUSPEND_TABLE"
    },
    "TTSettings": {
        "TTS3Settings": null,
        "TTRecordSettings": null,
        "EnableTT": false
    },
    "FullLoadSettings": {
        "CommitRate": 10000,
        "StopTaskCachedChangesApplied": false,
        "StopTaskCachedChangesNotApplied": false,
        "MaxFullLoadSubTasks": 8,
        "TransactionConsistencyTimeout": 600,
        "CreatePkAfterFullLoad": false,
        "TargetTablePrepMode": "DO_NOTHING"
    },
    "TargetMetadata": {
        "ParallelApplyBufferSize": 500,
        "ParallelApplyQueuesPerThread": 0,
        "ParallelApplyThreads": 32,
        "TargetSchema": "",
        "InlineLobMaxSize": 0,
        "ParallelLoadQueuesPerThread": 0,
        "SupportLobs": true,
        "LobChunkSize": 0,
        "TaskRecoveryTableEnabled": false,
        "ParallelLoadThreads": 8,
        "LobMaxSize": 32,
        "BatchApplyEnabled": true,
        "FullLobMode": false,
        "LimitedSizeLobMode": true,
        "LoadMaxFileSize": 0,
        "ParallelLoadBufferSize": 500
    },
    "BeforeImageSettings": null,
    "ControlTablesSettings": {
        "historyTimeslotInMinutes": 5,
        "HistoryTimeslotInMinutes": 5,
        "StatusTableEnabled": false,
        "SuspendedTablesTableEnabled": false,
        "HistoryTableEnabled": false,
        "ControlSchema": "",
        "FullLoadExceptionTableEnabled": false
    },
    "LoopbackPreventionSettings": null,
    "CharacterSetSettings": null,
    "FailTaskWhenCleanTaskResourceFailed": false,
    "ChangeProcessingTuning": {
        "StatementCacheSize": 50,
        "CommitTimeout": 1,
        "BatchApplyPreserveTransaction": true,
        "BatchApplyTimeoutMin": 1,
        "BatchSplitSize": 0,
        "BatchApplyTimeoutMax": 30,
        "MinTransactionSize": 1000,
        "MemoryKeepTime": 60,
        "BatchApplyMemoryLimit": 500,
        "MemoryLimitTotal": 1024
    },
    "ChangeProcessingDdlHandlingPolicy": {
        "HandleSourceTableDropped": true,
        "HandleSourceTableTruncated": true,
        "HandleSourceTableAltered": true
    },
    "PostProcessingRules": null
}

Do it happen due to a single AZ instead of multi-AZ CPU or are there any other primary reason?

0

There are 0 best solutions below