-
Notifications
You must be signed in to change notification settings - Fork 936
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[flink][cdc] Add support for retry cnt instead of busy wait and additional support to skip corrupt records in cdc writer #4295
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,15 +48,33 @@ public class CdcRecordStoreWriteOperator extends TableWriteOperator<CdcRecord> { | |
.durationType() | ||
.defaultValue(Duration.ofMillis(500)); | ||
|
||
public static final ConfigOption<Integer> MAX_RETRY_NUM_TIMES = | ||
ConfigOptions.key("cdc.max-retry-num-times") | ||
.intType() | ||
.defaultValue(100) | ||
.withDescription("Max retry count for updating table before failing loudly"); | ||
|
||
public static final ConfigOption<Boolean> SKIP_CORRUPT_RECORD = | ||
ConfigOptions.key("cdc.skip-corrupt-record") | ||
.booleanType() | ||
.defaultValue(false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. whether this is true or false, it will change the current default behaviour of waiting indefinitely for a schema change. so we could either add an option to preserve the current behaviour as default and make retries optional. or make sure patch notes contain an info about the breaking change that paimon will no longer wait indefinitely for schema updates but rather retry and optionally skip unreadable rows. |
||
.withDescription("Skip corrupt record if we fail to parse it"); | ||
|
||
private final long retrySleepMillis; | ||
|
||
private final int maxRetryNumTimes; | ||
|
||
private final boolean skipCorruptRecord; | ||
|
||
public CdcRecordStoreWriteOperator( | ||
FileStoreTable table, | ||
StoreSinkWrite.Provider storeSinkWriteProvider, | ||
String initialCommitUser) { | ||
super(table, storeSinkWriteProvider, initialCommitUser); | ||
this.retrySleepMillis = | ||
table.coreOptions().toConfiguration().get(RETRY_SLEEP_TIME).toMillis(); | ||
this.maxRetryNumTimes = table.coreOptions().toConfiguration().get(MAX_RETRY_NUM_TIMES); | ||
this.skipCorruptRecord = table.coreOptions().toConfiguration().get(SKIP_CORRUPT_RECORD); | ||
} | ||
|
||
@Override | ||
|
@@ -75,7 +93,7 @@ public void processElement(StreamRecord<CdcRecord> element) throws Exception { | |
CdcRecord record = element.getValue(); | ||
Optional<GenericRow> optionalConverted = toGenericRow(record, table.schema().fields()); | ||
if (!optionalConverted.isPresent()) { | ||
while (true) { | ||
for (int retry = 0; retry < maxRetryNumTimes; ++retry) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: to me it's slightly unusual to increment the retry counter before entering the block. but i guess it does the same thing as |
||
table = table.copyWithLatestSchema(); | ||
optionalConverted = toGenericRow(record, table.schema().fields()); | ||
if (optionalConverted.isPresent()) { | ||
|
@@ -86,10 +104,18 @@ public void processElement(StreamRecord<CdcRecord> element) throws Exception { | |
write.replace(table); | ||
} | ||
|
||
try { | ||
write.write(optionalConverted.get()); | ||
} catch (Exception e) { | ||
throw new IOException(e); | ||
if (!optionalConverted.isPresent()) { | ||
if (skipCorruptRecord) { | ||
LOG.warn("Skipping corrupt or unparsable record {}", record); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just a note that this might leak sensitive data to the log system. maybe we can log some metadata about the record instead of the full record? |
||
} else { | ||
throw new RuntimeException("Unable to process element. Possibly a corrupt record"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we include some info about the record in the exception too? |
||
} | ||
} else { | ||
try { | ||
write.write(optionalConverted.get()); | ||
} catch (Exception e) { | ||
throw new IOException(e); | ||
} | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should we prefix the new options with
retry-
similar to the already existing onecdc.retry-sleep-time
?e.g.
cdc.retry-count
andcdc.retry-skip-corrupt-record
?