From ed2d36852565903bb628ff042f0e9ea746dcf8f0 Mon Sep 17 00:00:00 2001 From: <> Date: Sat, 20 Jul 2024 10:08:57 +0000 Subject: [PATCH] Deployed 45f383b with MkDocs version: 1.6.0 --- .nojekyll | 0 404.html | 880 + api/feature_elimination.html | 5744 + api/model_interpret.html | 5461 + api/sample_similarity.html | 5263 + api/utils.html | 1631 + assets/_mkdocstrings.css | 119 + assets/images/favicon.png | Bin 0 -> 1870 bytes assets/javascripts/bundle.fe8b6f2b.min.js | 29 + assets/javascripts/bundle.fe8b6f2b.min.js.map | 7 + assets/javascripts/lunr/min/lunr.ar.min.js | 1 + assets/javascripts/lunr/min/lunr.da.min.js | 18 + assets/javascripts/lunr/min/lunr.de.min.js | 18 + assets/javascripts/lunr/min/lunr.du.min.js | 18 + assets/javascripts/lunr/min/lunr.el.min.js | 1 + assets/javascripts/lunr/min/lunr.es.min.js | 18 + assets/javascripts/lunr/min/lunr.fi.min.js | 18 + assets/javascripts/lunr/min/lunr.fr.min.js | 18 + assets/javascripts/lunr/min/lunr.he.min.js | 1 + assets/javascripts/lunr/min/lunr.hi.min.js | 1 + assets/javascripts/lunr/min/lunr.hu.min.js | 18 + assets/javascripts/lunr/min/lunr.hy.min.js | 1 + assets/javascripts/lunr/min/lunr.it.min.js | 18 + assets/javascripts/lunr/min/lunr.ja.min.js | 1 + assets/javascripts/lunr/min/lunr.jp.min.js | 1 + assets/javascripts/lunr/min/lunr.kn.min.js | 1 + assets/javascripts/lunr/min/lunr.ko.min.js | 1 + assets/javascripts/lunr/min/lunr.multi.min.js | 1 + assets/javascripts/lunr/min/lunr.nl.min.js | 18 + assets/javascripts/lunr/min/lunr.no.min.js | 18 + assets/javascripts/lunr/min/lunr.pt.min.js | 18 + assets/javascripts/lunr/min/lunr.ro.min.js | 18 + assets/javascripts/lunr/min/lunr.ru.min.js | 18 + assets/javascripts/lunr/min/lunr.sa.min.js | 1 + .../lunr/min/lunr.stemmer.support.min.js | 1 + assets/javascripts/lunr/min/lunr.sv.min.js | 18 + assets/javascripts/lunr/min/lunr.ta.min.js | 1 + assets/javascripts/lunr/min/lunr.te.min.js | 1 + assets/javascripts/lunr/min/lunr.th.min.js | 1 + 
assets/javascripts/lunr/min/lunr.tr.min.js | 18 + assets/javascripts/lunr/min/lunr.vi.min.js | 1 + assets/javascripts/lunr/min/lunr.zh.min.js | 1 + assets/javascripts/lunr/tinyseg.js | 206 + assets/javascripts/lunr/wordcut.js | 6708 + .../workers/search.b8dbb3d2.min.js | 42 + .../workers/search.b8dbb3d2.min.js.map | 7 + assets/stylesheets/main.76a95c52.min.css | 1 + assets/stylesheets/main.76a95c52.min.css.map | 1 + assets/stylesheets/palette.06af60db.min.css | 1 + .../stylesheets/palette.06af60db.min.css.map | 1 + discussion/nb_rfecv_vs_shaprfecv.html | 16157 ++ howto/grouped_data.html | 1726 + howto/reproducibility.html | 2002 + img/Probatus_P.png | Bin 0 -> 21709 bytes img/Probatus_P_white.png | Bin 0 -> 11981 bytes img/earlystoppingshaprfecv.png | Bin 0 -> 102139 bytes img/logo_large.png | Bin 0 -> 48806 bytes img/logo_large_white.png | Bin 0 -> 15443 bytes img/model_interpret_dep.png | Bin 0 -> 33014 bytes img/model_interpret_importance.png | Bin 0 -> 16297 bytes img/model_interpret_sample.png | Bin 0 -> 40708 bytes img/model_interpret_summary.png | Bin 0 -> 27475 bytes img/resemblance_model_schema.png | Bin 0 -> 151395 bytes ...mple_similarity_permutation_importance.png | Bin 0 -> 18802 bytes img/sample_similarity_shap_importance.png | Bin 0 -> 15849 bytes img/sample_similarity_shap_summary.png | Bin 0 -> 32657 bytes img/shaprfecv.png | Bin 0 -> 27532 bytes index.html | 1005 + objects.inv | Bin 0 -> 714 bytes search/search_index.json | 1 + sitemap.xml | 78 + sitemap.xml.gz | Bin 0 -> 419 bytes tutorials/nb_automatic_best_num_features.html | 2108 + tutorials/nb_custom_scoring.html | 1747 + tutorials/nb_sample_similarity.html | 1962 + tutorials/nb_shap_dependence.html | 1983 + tutorials/nb_shap_feature_elimination.html | 194166 +++++++++++++++ tutorials/nb_shap_model_interpreter.html | 2227 + ...riance_penalty_and_results_comparison.html | 2674 + 79 files changed, 254224 insertions(+) create mode 100644 .nojekyll create mode 100644 404.html create mode 100644 
api/feature_elimination.html create mode 100644 api/model_interpret.html create mode 100644 api/sample_similarity.html create mode 100644 api/utils.html create mode 100644 assets/_mkdocstrings.css create mode 100644 assets/images/favicon.png create mode 100644 assets/javascripts/bundle.fe8b6f2b.min.js create mode 100644 assets/javascripts/bundle.fe8b6f2b.min.js.map create mode 100644 assets/javascripts/lunr/min/lunr.ar.min.js create mode 100644 assets/javascripts/lunr/min/lunr.da.min.js create mode 100644 assets/javascripts/lunr/min/lunr.de.min.js create mode 100644 assets/javascripts/lunr/min/lunr.du.min.js create mode 100644 assets/javascripts/lunr/min/lunr.el.min.js create mode 100644 assets/javascripts/lunr/min/lunr.es.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.he.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hu.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hy.min.js create mode 100644 assets/javascripts/lunr/min/lunr.it.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ja.min.js create mode 100644 assets/javascripts/lunr/min/lunr.jp.min.js create mode 100644 assets/javascripts/lunr/min/lunr.kn.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ko.min.js create mode 100644 assets/javascripts/lunr/min/lunr.multi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.nl.min.js create mode 100644 assets/javascripts/lunr/min/lunr.no.min.js create mode 100644 assets/javascripts/lunr/min/lunr.pt.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ro.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ru.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sa.min.js create mode 100644 assets/javascripts/lunr/min/lunr.stemmer.support.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sv.min.js 
create mode 100644 assets/javascripts/lunr/min/lunr.ta.min.js create mode 100644 assets/javascripts/lunr/min/lunr.te.min.js create mode 100644 assets/javascripts/lunr/min/lunr.th.min.js create mode 100644 assets/javascripts/lunr/min/lunr.tr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.vi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.zh.min.js create mode 100644 assets/javascripts/lunr/tinyseg.js create mode 100644 assets/javascripts/lunr/wordcut.js create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js.map create mode 100644 assets/stylesheets/main.76a95c52.min.css create mode 100644 assets/stylesheets/main.76a95c52.min.css.map create mode 100644 assets/stylesheets/palette.06af60db.min.css create mode 100644 assets/stylesheets/palette.06af60db.min.css.map create mode 100644 discussion/nb_rfecv_vs_shaprfecv.html create mode 100644 howto/grouped_data.html create mode 100644 howto/reproducibility.html create mode 100644 img/Probatus_P.png create mode 100644 img/Probatus_P_white.png create mode 100644 img/earlystoppingshaprfecv.png create mode 100644 img/logo_large.png create mode 100644 img/logo_large_white.png create mode 100644 img/model_interpret_dep.png create mode 100644 img/model_interpret_importance.png create mode 100644 img/model_interpret_sample.png create mode 100644 img/model_interpret_summary.png create mode 100644 img/resemblance_model_schema.png create mode 100644 img/sample_similarity_permutation_importance.png create mode 100644 img/sample_similarity_shap_importance.png create mode 100644 img/sample_similarity_shap_summary.png create mode 100644 img/shaprfecv.png create mode 100644 index.html create mode 100644 objects.inv create mode 100644 search/search_index.json create mode 100644 sitemap.xml create mode 100644 sitemap.xml.gz create mode 100644 tutorials/nb_automatic_best_num_features.html create mode 100644 
tutorials/nb_custom_scoring.html create mode 100644 tutorials/nb_sample_similarity.html create mode 100644 tutorials/nb_shap_dependence.html create mode 100644 tutorials/nb_shap_feature_elimination.html create mode 100644 tutorials/nb_shap_model_interpreter.html create mode 100644 tutorials/nb_shap_variance_penalty_and_results_comparison.html diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..99f601f1 --- /dev/null +++ b/404.html @@ -0,0 +1,880 @@ + + + +
+ + + + + + + + + + + + + + + + + + +This module focuses on feature elimination and it contains two classes:
+ShapRFECV
+
+
+
+ Bases: BaseFitComputePlotClass
This class performs Backwards Recursive Feature Elimination, using SHAP feature importance.
+At each round, for a + given feature set, starting from all available features, the following steps are applied:
+step
lowest SHAP importance features from the dataset.At the end of the process, the user can plot the performance of the model for each iteration, and select the + optimal number of features and the features set.
+The functionality is
+ similar to RFECV.
+ The main difference is removing the lowest importance features based on SHAP features importance. It also
+ supports the use of sklearn compatible search CV for hyperparameter optimization e.g.
+ GridSearchCV,
+ RandomizedSearchCV, or
+ BayesSearchCV, which
+ needs to be passed as the model
. Thanks to this you can perform hyperparameter optimization at each step of
+ the feature elimination. Lastly, it supports categorical features (object and category dtype) and missing values
+ in the data, as long as the model supports them.
We recommend using LGBMClassifier, + because by default it handles missing values and categorical features. In case of other models, make sure to + handle these issues for your dataset and consider the impact it might have on feature importance.
+Example:
+import numpy as np
+import pandas as pd
+from probatus.feature_elimination import ShapRFECV
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import RandomizedSearchCV
+
+feature_names = [
+ 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7',
+ 'f8', 'f9', 'f10', 'f11', 'f12', 'f13',
+ 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20']
+
+# Prepare two samples
+X, y = make_classification(n_samples=200, class_sep=0.05, n_informative=6, n_features=20,
+ random_state=0, n_redundant=10, n_clusters_per_class=1)
+X = pd.DataFrame(X, columns=feature_names)
+
+
+# Prepare model and parameter search space
+model = RandomForestClassifier(max_depth=5, class_weight='balanced')
+
+param_grid = {
+ 'n_estimators': [5, 7, 10],
+ 'min_samples_leaf': [3, 5, 7, 10],
+}
+search = RandomizedSearchCV(model, param_grid)
+
+
+# Run feature elimination
+shap_elimination = ShapRFECV(
+ model=search, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
+report = shap_elimination.fit_compute(X, y)
+
+# Make plots
+performance_plot = shap_elimination.plot()
+
+# Get final feature set
+final_features_set = shap_elimination.get_reduced_features_set(num_features=3)
+
+
+
+ probatus/feature_elimination/feature_elimination.py
23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 + 100 + 101 + 102 + 103 + 104 + 105 + 106 + 107 + 108 + 109 + 110 + 111 + 112 + 113 + 114 + 115 + 116 + 117 + 118 + 119 + 120 + 121 + 122 + 123 + 124 + 125 + 126 + 127 + 128 + 129 + 130 + 131 + 132 + 133 + 134 + 135 + 136 + 137 + 138 + 139 + 140 + 141 + 142 + 143 + 144 + 145 + 146 + 147 + 148 + 149 + 150 + 151 + 152 + 153 + 154 + 155 + 156 + 157 + 158 + 159 + 160 + 161 + 162 + 163 + 164 + 165 + 166 + 167 + 168 + 169 + 170 + 171 + 172 + 173 + 174 + 175 + 176 + 177 + 178 + 179 + 180 + 181 + 182 + 183 + 184 + 185 + 186 + 187 + 188 + 189 + 190 + 191 + 192 + 193 + 194 + 195 + 196 + 197 + 198 + 199 + 200 + 201 + 202 + 203 + 204 + 205 + 206 + 207 + 208 + 209 + 210 + 211 + 212 + 213 + 214 + 215 + 216 + 217 + 218 + 219 + 220 + 221 + 222 + 223 + 224 + 225 + 226 + 227 + 228 + 229 + 230 + 231 + 232 + 233 + 234 + 235 + 236 + 237 + 238 + 239 + 240 + 241 + 242 + 243 + 244 + 245 + 246 + 247 + 248 + 249 + 250 + 251 + 252 + 253 + 254 + 255 + 256 + 257 + 258 + 259 + 260 + 261 + 262 + 263 + 264 + 265 + 266 + 267 + 268 + 269 + 270 + 271 + 272 + 273 + 274 + 275 + 276 + 277 + 278 + 279 + 280 + 281 + 282 + 283 + 284 + 285 + 286 + 287 + 288 + 289 + 290 + 291 + 292 + 293 + 294 + 295 + 296 + 297 + 298 + 299 + 300 + 301 + 302 + 303 + 304 + 305 + 306 + 307 + 308 + 309 + 310 + 311 + 312 + 313 + 314 + 315 + 316 + 317 + 318 + 319 + 320 + 321 + 322 + 323 + 324 + 325 + 326 + 327 + 328 + 329 + 330 + 331 + 332 + 333 + 334 + 335 + 336 + 337 + 338 + 339 + 340 + 341 + 342 + 343 + 344 + 345 + 346 + 347 + 348 + 349 + 350 + 351 + 352 + 353 + 354 + 355 + 356 + 357 + 358 + 359 + 360 + 361 + 362 + 363 + 364 + 365 + 366 + 367 + 368 + 
369 + 370 + 371 + 372 + 373 + 374 + 375 + 376 + 377 + 378 + 379 + 380 + 381 + 382 + 383 + 384 + 385 + 386 + 387 + 388 + 389 + 390 + 391 + 392 + 393 + 394 + 395 + 396 + 397 + 398 + 399 + 400 + 401 + 402 + 403 + 404 + 405 + 406 + 407 + 408 + 409 + 410 + 411 + 412 + 413 + 414 + 415 + 416 + 417 + 418 + 419 + 420 + 421 + 422 + 423 + 424 + 425 + 426 + 427 + 428 + 429 + 430 + 431 + 432 + 433 + 434 + 435 + 436 + 437 + 438 + 439 + 440 + 441 + 442 + 443 + 444 + 445 + 446 + 447 + 448 + 449 + 450 + 451 + 452 + 453 + 454 + 455 + 456 + 457 + 458 + 459 + 460 + 461 + 462 + 463 + 464 + 465 + 466 + 467 + 468 + 469 + 470 + 471 + 472 + 473 + 474 + 475 + 476 + 477 + 478 + 479 + 480 + 481 + 482 + 483 + 484 + 485 + 486 + 487 + 488 + 489 + 490 + 491 + 492 + 493 + 494 + 495 + 496 + 497 + 498 + 499 + 500 + 501 + 502 + 503 + 504 + 505 + 506 + 507 + 508 + 509 + 510 + 511 + 512 + 513 + 514 + 515 + 516 + 517 + 518 + 519 + 520 + 521 + 522 + 523 + 524 + 525 + 526 + 527 + 528 + 529 + 530 + 531 + 532 + 533 + 534 + 535 + 536 + 537 + 538 + 539 + 540 + 541 + 542 + 543 + 544 + 545 + 546 + 547 + 548 + 549 + 550 + 551 + 552 + 553 + 554 + 555 + 556 + 557 + 558 + 559 + 560 + 561 + 562 + 563 + 564 + 565 + 566 + 567 + 568 + 569 + 570 + 571 + 572 + 573 + 574 + 575 + 576 + 577 + 578 + 579 + 580 + 581 + 582 + 583 + 584 + 585 + 586 + 587 + 588 + 589 + 590 + 591 + 592 + 593 + 594 + 595 + 596 + 597 + 598 + 599 + 600 + 601 + 602 + 603 + 604 + 605 + 606 + 607 + 608 + 609 + 610 + 611 + 612 + 613 + 614 + 615 + 616 + 617 + 618 + 619 + 620 + 621 + 622 + 623 + 624 + 625 + 626 + 627 + 628 + 629 + 630 + 631 + 632 + 633 + 634 + 635 + 636 + 637 + 638 + 639 + 640 + 641 + 642 + 643 + 644 + 645 + 646 + 647 + 648 + 649 + 650 + 651 + 652 + 653 + 654 + 655 + 656 + 657 + 658 + 659 + 660 + 661 + 662 + 663 + 664 + 665 + 666 + 667 + 668 + 669 + 670 + 671 + 672 + 673 + 674 + 675 + 676 + 677 + 678 + 679 + 680 + 681 + 682 + 683 + 684 + 685 + 686 + 687 + 688 + 689 + 690 + 691 + 692 + 693 + 694 + 695 + 696 + 697 + 698 + 699 + 700 + 701 + 
702 + 703 + 704 + 705 + 706 + 707 + 708 + 709 + 710 + 711 + 712 + 713 + 714 + 715 + 716 + 717 + 718 + 719 + 720 + 721 + 722 + 723 + 724 + 725 + 726 + 727 + 728 + 729 + 730 + 731 + 732 + 733 + 734 + 735 + 736 + 737 + 738 + 739 + 740 + 741 + 742 + 743 + 744 + 745 + 746 + 747 + 748 + 749 + 750 + 751 + 752 + 753 + 754 + 755 + 756 + 757 + 758 + 759 + 760 + 761 + 762 + 763 + 764 + 765 + 766 + 767 + 768 + 769 + 770 + 771 + 772 + 773 + 774 + 775 + 776 + 777 + 778 + 779 + 780 + 781 + 782 + 783 + 784 + 785 + 786 + 787 + 788 + 789 + 790 + 791 + 792 + 793 + 794 + 795 + 796 + 797 + 798 + 799 + 800 + 801 + 802 + 803 + 804 + 805 + 806 + 807 + 808 + 809 + 810 + 811 + 812 + 813 + 814 + 815 + 816 + 817 + 818 + 819 + 820 + 821 + 822 + 823 + 824 + 825 + 826 + 827 + 828 + 829 + 830 + 831 + 832 + 833 + 834 + 835 + 836 + 837 + 838 + 839 + 840 + 841 + 842 + 843 + 844 + 845 + 846 + 847 + 848 + 849 + 850 + 851 + 852 + 853 + 854 + 855 + 856 + 857 + 858 + 859 + 860 + 861 + 862 + 863 + 864 + 865 + 866 + 867 + 868 + 869 + 870 + 871 + 872 + 873 + 874 + 875 + 876 + 877 + 878 + 879 + 880 + 881 + 882 + 883 + 884 + 885 + 886 + 887 + 888 + 889 + 890 + 891 + 892 + 893 + 894 + 895 + 896 + 897 + 898 + 899 + 900 + 901 + 902 + 903 + 904 + 905 + 906 + 907 + 908 + 909 + 910 + 911 + 912 + 913 + 914 + 915 + 916 + 917 + 918 + 919 + 920 + 921 + 922 + 923 + 924 + 925 + 926 + 927 + 928 + 929 + 930 + 931 + 932 + 933 + 934 + 935 + 936 + 937 + 938 + 939 + 940 + 941 + 942 + 943 + 944 + 945 + 946 + 947 + 948 + 949 + 950 + 951 + 952 + 953 + 954 + 955 + 956 + 957 + 958 + 959 + 960 + 961 + 962 + 963 + 964 + 965 + 966 + 967 + 968 + 969 + 970 + 971 + 972 + 973 + 974 + 975 + 976 + 977 + 978 + 979 + 980 + 981 + 982 + 983 + 984 + 985 + 986 + 987 + 988 + 989 + 990 + 991 + 992 + 993 + 994 + 995 + 996 + 997 + 998 + 999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +1031 +1032 +1033 +1034 
+1035 +1036 +1037 +1038 +1039 +1040 +1041 +1042 +1043 +1044 +1045 +1046 +1047 +1048 +1049 +1050 +1051 +1052 +1053 +1054 +1055 +1056 +1057 +1058 +1059 +1060 +1061 +1062 +1063 +1064 +1065 +1066 +1067 +1068 +1069 +1070 +1071 +1072 +1073 +1074 +1075 +1076 +1077 +1078 +1079 +1080 +1081 +1082 +1083 +1084 +1085 +1086 +1087 +1088 +1089 +1090 +1091 +1092 +1093 +1094 +1095 +1096 +1097 +1098 +1099 +1100 +1101 +1102 +1103 +1104 +1105 +1106 +1107 +1108 +1109 +1110 +1111 +1112 +1113 +1114 +1115 +1116 +1117 +1118 +1119 +1120 +1121 +1122 +1123 +1124 +1125 +1126 +1127 +1128 +1129 +1130 +1131 +1132 +1133 +1134 +1135 +1136 +1137 +1138 +1139 +1140 +1141 +1142 +1143 +1144 +1145 +1146 +1147 +1148 +1149 +1150 +1151 +1152 +1153 +1154 +1155 +1156 +1157 +1158 +1159 +1160 +1161 +1162 +1163 +1164 +1165 +1166 +1167 +1168 +1169 +1170 +1171 +1172 +1173 +1174 +1175 +1176 +1177 +1178 +1179 +1180 +1181 +1182 +1183 +1184 +1185 +1186 +1187 +1188 +1189 +1190 +1191 +1192 +1193 +1194 +1195 +1196 +1197 +1198 +1199 +1200 +1201 +1202 +1203 +1204 +1205 +1206 +1207 +1208 +1209 +1210 +1211 +1212 +1213 +1214 +1215 +1216 +1217 +1218 +1219 +1220 +1221 +1222 +1223 +1224 +1225 +1226 +1227 +1228 +1229 +1230 +1231 +1232 +1233 +1234 +1235 +1236 +1237 +1238 +1239 +1240 +1241 +1242 +1243 +1244 +1245 +1246 +1247 +1248 +1249 +1250 +1251 +1252 +1253 +1254 +1255 +1256 +1257 +1258 +1259 +1260 +1261 +1262 +1263 +1264 +1265 +1266 +1267 +1268 +1269 +1270 +1271 +1272 +1273 +1274 +1275 +1276 +1277 +1278 +1279 +1280 +1281 +1282 +1283 +1284 +1285 +1286 +1287 +1288 +1289 +1290 +1291 +1292 +1293 +1294 +1295 +1296 +1297 +1298 +1299 +1300 +1301 +1302 +1303 +1304 +1305 +1306 +1307 +1308 +1309 +1310 +1311 +1312 +1313 +1314 +1315 +1316 +1317 +1318 +1319 +1320 +1321 +1322 +1323 +1324 +1325 +1326 +1327 +1328 +1329 +1330 +1331 +1332 +1333 +1334 +1335 +1336 +1337 +1338 +1339 +1340 +1341 +1342 +1343 +1344 +1345 +1346 +1347 +1348 +1349 +1350 +1351 |
|
__init__(model, step=1, min_features_to_select=1, cv=None, scoring='roc_auc', n_jobs=-1, verbose=0, random_state=None, early_stopping_rounds=None, eval_metric=None)
+
+This method initializes the class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ classifier or regressor, sklearn compatible search CV e.g. GridSearchCV, RandomizedSearchCV or BayesSearchCV
+ |
+
+
+
+ A model that will be optimized and trained at each round of feature elimination. The recommended model +is LGBMClassifier, +because it by default handles the missing values and categorical variables. This parameter also supports +any hyperparameter search schema that is consistent with the sklearn API e.g. +GridSearchCV, +RandomizedSearchCV +or BayesSearchCV. + |
+ + required + | +
step |
+
+ int or float
+ |
+
+
+
+ Number of lowest importance features removed each round. If it is an int, then each round such a number of +features are discarded. If float, such a percentage of remaining features (rounded down) is removed each +iteration. It is recommended to use float, since it is faster for a large number of features, and slows +down and becomes more precise with fewer features. Note: the last round may remove fewer features in +order to reach min_features_to_select. +If columns_to_keep parameter is specified in the fit method, step is the number of features to remove after +keeping those columns. + |
+
+ 1
+ |
+
min_features_to_select |
+
+ int
+ |
+
+
+
+ Minimum number of features to be kept. This is a stopping criterion of the feature elimination. By +default the process stops when one feature is left. If columns_to_keep is specified in the fit method, +it may override this parameter to the maximum of its current value and the length of columns_to_keep. + |
+
+ 1
+ |
+
cv |
+
+ int, cross-validation generator or an iterable
+ |
+
+
+
+ Determines the cross-validation splitting strategy. Compatible with sklearn +cv parameter. +If None, then cv of 5 is used. + |
+
+ None
+ |
+
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be a metric name aligned with predefined +classification scorer names in sklearn. + |
+
+ 'roc_auc'
+ |
+
n_jobs |
+
+ int
+ |
+
+
+
+ Number of cores to run in parallel while fitting across folds. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors.
+ |
+
+ -1
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set at each round of feature elimination. If it is None, the results will not be +reproducible and in random search at each iteration different hyperparameters might be tested. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
early_stopping_rounds |
+
+ int
+ |
+
+
+
+ Number of rounds with constant performance after which the model fitting stops. This is passed to the +fit method of the model for Shapley values estimation, but not for hyperparameter search. Only +supported by some models, such as XGBoost, LightGBM and CatBoost. Only recommended when dealing with large sets of data. + |
+
+ None
+ |
+
eval_metric |
+
+ str
+ |
+
+
+
+ Metric for scoring fitting rounds and activating early stopping. This is passed to the
+fit method of the model for Shapley values estimation, but not for hyperparameter search. Only
+supported by some models, such as XGBoost
+and LightGBM.
+Note that |
+
+ None
+ |
+
probatus/feature_elimination/feature_elimination.py
104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 |
|
compute()
+
+Checks if fit() method has been run.
+and computes the DataFrame with results of feature elimination for each round.
+ + +Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+
+
+
+ DataFrame with results of feature elimination for each round. + |
+
probatus/feature_elimination/feature_elimination.py
234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 |
|
fit(X, y, sample_weight=None, columns_to_keep=None, column_names=None, groups=None, shap_variance_penalty_factor=None, **shap_kwargs)
+
+Fits the object with the provided data.
+The algorithm starts with the entire dataset, and then sequentially
+ eliminates features. If sklearn compatible search CV is passed as model e.g.
+ GridSearchCV,
+ RandomizedSearchCV
+ or BayesSearchCV,
+ the hyperparameter optimization is applied at each step of the elimination.
+ Then, the SHAP feature importance is calculated using Cross-Validation,
+ and step
lowest importance features are removed.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ DataFrame
+ |
+
+
+
+ Provided dataset. + |
+ + required + | +
y |
+
+ Series
+ |
+
+
+
+ Labels for X. + |
+ + required + | +
sample_weight |
+
+ (Series, ndarray, list)
+ |
+
+
+
+ array-like of shape (n_samples,) - only use if the model you're using supports +sample weighting (check the corresponding scikit-learn documentation). +Array of weights that are assigned to individual samples. +Note that they're only used for fitting of the model, not during evaluation of metrics. +If not provided, then each sample is given unit weight. + |
+
+ None
+ |
+
columns_to_keep |
+
+ list of str
+ |
+
+
+
+ List of column names to keep. If given, +these columns will not be eliminated by the feature elimination process. +However, these features will still be used for the calculation of the SHAP values. + |
+
+ None
+ |
+
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
groups |
+
+ (Series, ndarray, list)
+ |
+
+
+
+ array-like of shape (n_samples,)
+Group labels for the samples used while splitting the dataset into train/test set.
+Only used in conjunction with a "Group" |
+
+ None
+ |
+
shap_variance_penalty_factor |
+
+ int or float
+ |
+
+
+
+ Apply aggregation penalty when computing average of shap values for a given feature. +Results in a preference for features that have smaller standard deviation of shap +values (more coherent shap importance). Recommend value 0.5 - 1.0. +Formula: penalized_shap_mean = (mean_shap - (std_shap * shap_variance_penalty_factor)) + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ ShapRFECV
+ |
+
+
+
+ Fitted object. + |
+
probatus/feature_elimination/feature_elimination.py
322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 |
|
fit_compute(X, y, sample_weight=None, columns_to_keep=None, column_names=None, shap_variance_penalty_factor=None, **shap_kwargs)
+
+Fits the object with the provided data.
+The algorithm starts with the entire dataset, and then sequentially
+ eliminates features. If sklearn compatible search CV is passed as model e.g.
+ GridSearchCV,
+ RandomizedSearchCV
+ or BayesSearchCV,
+ the hyperparameter optimization is applied at each step of the elimination.
+ Then, the SHAP feature importance is calculated using Cross-Validation,
+ and step
lowest importance features are removed. At the end, the
+ report containing results from each iteration is computed and returned to the user.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ DataFrame
+ |
+
+
+
+ Provided dataset. + |
+ + required + | +
y |
+
+ Series
+ |
+
+
+
+ Labels for X. + |
+ + required + | +
sample_weight |
+
+ (Series, ndarray, list)
+ |
+
+
+
+ array-like of shape (n_samples,) - only use if the model you're using supports +sample weighting (check the corresponding scikit-learn documentation). +Array of weights that are assigned to individual samples. +Note that they're only used for fitting of the model, not during evaluation of metrics. +If not provided, then each sample is given unit weight. + |
+
+ None
+ |
+
columns_to_keep |
+
+ list of str
+ |
+
+
+
+ List of columns to keep. If given, these columns will not be eliminated. + |
+
+ None
+ |
+
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
shap_variance_penalty_factor |
+
+ int or float
+ |
+
+
+
+ Apply aggregation penalty when computing average of shap values for a given feature. +Results in a preference for features that have smaller standard deviation of shap +values (more coherent shap importance). Recommend value 0.5 - 1.0. +Formula: penalized_shap_mean = (mean_shap - (std_shap * shap_variance_penalty_factor)) + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+
+
+
+ DataFrame containing results of feature elimination from each iteration. + |
+
probatus/feature_elimination/feature_elimination.py
248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 |
|
get_reduced_features_set(num_features, standard_error_threshold=1.0, return_type='feature_names')
+
+Gets the features set after the feature elimination process, for a given number of features.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
num_features |
+
+ int or str
+ |
+
+
+
+ If int: Number of features in the reduced features set. +If str: One of the following automatic num feature selection methods supported: + 1. best: strictly selects the num_features with the highest model score. + 2. best_coherent: For iterations that are within standard_error_threshold of the highest + score, select the iteration with the lowest standard deviation of model score. + 3. best_parsimonious: For iterations that are within standard_error_threshold of the + highest score, select the iteration with the fewest features. + |
+ + required + | +
standard_error_threshold |
+
+ float
+ |
+
+
+
+ If num_features is 'best_coherent' or 'best_parsimonious', this parameter is used. + |
+
+ 1.0
+ |
+
return_type |
+ + | +
+
+
+ Accepts possible values of 'feature_names', 'support' or 'ranking'. These are defined as: + 1. feature_names: returns column names + 2. support: returns boolean mask + 3. ranking: returns numeric ranking of features + |
+
+ 'feature_names'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ list of str
+ |
+
+
+
+ Reduced features set. + |
+
probatus/feature_elimination/feature_elimination.py
829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 |
|
plot(show=True, **figure_kwargs)
+
+Generates plot of the model performance for each iteration of feature elimination.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are shown to the user, otherwise they are not shown. Not showing the plot can be useful +when you want to edit the returned figure before showing it. + |
+
+ True
+ |
+
**figure_kwargs |
+ + | +
+
+
+ Keyword arguments that are passed to the plt.figure, at its initialization. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ figure
+ |
+
+
+
+ Figure containing the performance plot. + |
+
probatus/feature_elimination/feature_elimination.py
531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 |
|
The aim of this module is to provide tools for model interpretation using the SHAP library. +The class below is a convenience wrapper that implements multiple plots for tree-based & linear models.
+ + +ShapModelInterpreter
+
+
+
+ Bases: BaseFitComputePlotClass
This class is a wrapper that allows you to easily analyse a model's features.
+It allows us to plot SHAP feature importance, + SHAP summary plot and SHAP dependence plots.
+Example:
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from probatus.interpret import ShapModelInterpreter
+import numpy as np
+import pandas as pd
+
+feature_names = ['f1', 'f2', 'f3', 'f4']
+
+# Prepare two samples
+X, y = make_classification(n_samples=5000, n_features=4, random_state=0)
+X = pd.DataFrame(X, columns=feature_names)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Prepare and fit model. Remember about class_weight="balanced" or an equivalent.
+model = RandomForestClassifier(class_weight='balanced', n_estimators = 100, max_depth=2, random_state=0)
+model.fit(X_train, y_train)
+
+# Train ShapModelInterpreter
+shap_interpreter = ShapModelInterpreter(model)
+feature_importance = shap_interpreter.fit_compute(X_train, X_test, y_train, y_test)
+
+# Make plots
+ax1 = shap_interpreter.plot('importance')
+ax2 = shap_interpreter.plot('summary')
+ax3 = shap_interpreter.plot('dependence', target_columns=['f1', 'f2'])
+ax4 = shap_interpreter.plot('sample', samples_index=[X_test.index.tolist()[0]])
+
++ + +
+ +probatus/interpret/model_interpret.py
19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 
+419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 |
|
__init__(model, scoring='roc_auc', verbose=0, random_state=None)
+
+Initializes the class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ classifier or regressor
+ |
+
+
+
+ Model fitted on X_train. + |
+ + required + | +
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined classification scorers names in sklearn +(link). +Another option is using probatus.utils.Scorer to define a custom metric. + |
+
+ 'roc_auc'
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set for the nr of samples. If it is None, the results will not be reproducible. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/interpret/model_interpret.py
63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 |
|
compute(return_scores=False, shap_variance_penalty_factor=None)
+
+Computes the DataFrame that presents the importance of each feature.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
return_scores |
+
+ bool
+ |
+
+
+
+ Flag indicating whether the method should return the train and test score of the model, together with +the model interpretation report. If true, the output of this method is a tuple of DataFrame, float, +float. + |
+
+ False
+ |
+
shap_variance_penalty_factor |
+
+ int or float
+ |
+
+
+
+ Apply aggregation penalty when computing average of shap values for a given feature. +Results in a preference for features that have smaller standard deviation of shap +values (more coherent shap importance). Recommend value 0.5 - 1.0. +Formula: penalized_shap_mean = (mean_shap - (std_shap * shap_variance_penalty_factor)) + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataFrame or tuple(DataFrame, float, float)
+ |
+
+
+
+ Dataframe with SHAP feature importance, or tuple containing the dataframe, train and test scores of the +model. + |
+
probatus/interpret/model_interpret.py
232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 |
|
fit(X_train, X_test, y_train, y_test, column_names=None, class_names=None, **shap_kwargs)
+
+Fits the object and calculates the shap values for the provided datasets.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X_train |
+
+ DataFrame
+ |
+
+
+
+ Dataframe containing training data. + |
+ + required + | +
X_test |
+
+ DataFrame
+ |
+
+
+
+ Dataframe containing test data. + |
+ + required + | +
y_train |
+
+ Series
+ |
+
+
+
+ Series of labels for train data. + |
+ + required + | +
y_test |
+
+ Series
+ |
+
+
+
+ Series of labels for test data. + |
+ + required + | +
column_names |
+
+ None, or list of str
+ |
+
+
+
+ List of feature names for the dataset. If None, then column names from the X_train dataframe are used. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names e.g. ['neg', 'pos']. If none, the default ['Negative Class', 'Positive Class'] are +used. + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
probatus/interpret/model_interpret.py
93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 |
|
fit_compute(X_train, X_test, y_train, y_test, column_names=None, class_names=None, return_scores=False, shap_variance_penalty_factor=None, **shap_kwargs)
+
+Fits the object and calculates the shap values for the provided datasets.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X_train |
+
+ DataFrame
+ |
+
+
+
+ Dataframe containing training data. + |
+ + required + | +
X_test |
+
+ DataFrame
+ |
+
+
+
+ Dataframe containing test data. + |
+ + required + | +
y_train |
+
+ Series
+ |
+
+
+
+ Series of labels for train data. + |
+ + required + | +
y_test |
+
+ Series
+ |
+
+
+
+ Series of labels for test data. + |
+ + required + | +
column_names |
+
+ None, or list of str
+ |
+
+
+
+ List of feature names for the dataset. +If None, then column names from the X_train dataframe are used. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names e.g. ['neg', 'pos']. +If none, the default ['Negative Class', 'Positive Class'] are +used. + |
+
+ None
+ |
+
return_scores |
+
+ bool
+ |
+
+
+
+ Flag indicating whether the method should return +the train and test score of the model, +together with the model interpretation report. If true, +the output of this method is a tuple of DataFrame, float, +float. + |
+
+ False
+ |
+
shap_variance_penalty_factor |
+
+ int or float
+ |
+
+
+
+ Apply aggregation penalty when computing average of shap values for a given feature. +Results in a preference for features that have smaller standard deviation of shap +values (more coherent shap importance). Recommend value 0.5 - 1.0. +Formula: penalized_shap_mean = (mean_shap - (std_shap * shap_variance_penalty_factor)) + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataFrame or tuple(DataFrame, float, float)
+ |
+
+
+
+ Dataframe with SHAP feature importance, or tuple containing the dataframe, train and test scores of the +model. + |
+
probatus/interpret/model_interpret.py
287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 |
|
plot(plot_type, target_set='test', target_columns=None, samples_index=None, show=True, **plot_kwargs)
+
+Plots the appropriate SHAP plot.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
plot_type |
+
+ str
+ |
+
+
+
+ One of the following: +
|
+ + required + | +
target_set |
+
+ str
+ |
+
+
+
+ The set for which the plot should be generated, either |
+
+ 'test'
+ |
+
target_columns |
+
+ None, str or list of str
+ |
+
+
+
+ List of features names, for which the plots should be generated. If None, all features will be plotted. + |
+
+ None
+ |
+
samples_index |
+
+ (None, int, list or Index)
+ |
+
+
+
+ Index of samples to be explained if the |
+
+ None
+ |
+
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are shown to the user, otherwise they are not shown. Not showing the plot can be useful +when you want to edit the returned axis before showing it. + |
+
+ True
+ |
+
**plot_kwargs |
+ + | +
+
+
+ Keyword arguments passed to the plot method. For 'importance' and 'summary' plot_type, the kwargs are +passed to shap.summary_plot, for 'dependence' plot_type, they are passed to +probatus.interpret.DependencePlotter.plot method. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ axes or list(axes)
+ |
+
+
+
+ An Axes with the plot, or list of axes when multiple plots are returned. + |
+
probatus/interpret/model_interpret.py
360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 |
|
DependencePlotter
+
+
+
+ Bases: BaseFitComputePlotClass
Plotter used to plot SHAP dependence plot together with the target rates.
+Currently it supports tree-based and linear models.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+ + | +
+
+
+ classifier for which interpretation is done. + |
+ + required + | +
Example:
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from probatus.interpret import DependencePlotter
+
+X, y = make_classification(n_samples=15, n_features=3, n_informative=3, n_redundant=0, random_state=42)
+model = RandomForestClassifier().fit(X, y)
+bdp = DependencePlotter(model)
+shap_values = bdp.fit_compute(X, y)
+
+bdp.plot(feature=2)
+
+
+
+ probatus/interpret/shap_dependence.py
9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 |
|
__init__(model, verbose=0, random_state=None)
+
+Initializes the class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ model object
+ |
+
+
+
+ regression or classification model or pipeline. + |
+ + required + | +
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set for the nr of samples. If it is None, the results will not be reproducible. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/interpret/shap_dependence.py
35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 |
|
__repr__()
+
+Represent string method.
+ +probatus/interpret/shap_dependence.py
58 +59 +60 +61 +62 |
|
compute()
+
+Computes the report returned to the user, namely the SHAP values generated on the dataset.
+ + +Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+
+
+
+ SHAP Values for X. + |
+
probatus/interpret/shap_dependence.py
112 +113 +114 +115 +116 +117 +118 +119 +120 +121 |
|
fit(X, y, column_names=None, class_names=None, precalc_shap=None, **shap_kwargs)
+
+Fits the plotter to the model and data by computing the shap values.
+If the shap_values are passed, they do not need to be computed.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ DataFrame
+ |
+
+
+
+ input variables. + |
+ + required + | +
y |
+
+ Series
+ |
+
+
+
+ target variable. + |
+ + required + | +
column_names |
+
+ None, or list of str
+ |
+
+
+
+ List of feature names for the dataset. If None, then column names from the X_train dataframe are used. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names e.g. ['neg', 'pos']. If none, the default ['Negative Class', 'Positive Class'] are +used. + |
+
+ None
+ |
+
precalc_shap |
+
+ (Optional, None or array)
+ |
+
+
+
+ Precalculated shap values. If provided, they don't need to be computed. + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
probatus/interpret/shap_dependence.py
64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 |
|
fit_compute(X, y, column_names=None, class_names=None, precalc_shap=None, **shap_kwargs)
+
+Fits the plotter to the model and data by computing the shap values.
+If the shap_values are passed, they do not need to be computed.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ DataFrame
+ |
+
+
+
+ Provided dataset. + |
+ + required + | +
y |
+
+ Series
+ |
+
+
+
+ Labels for X. + |
+ + required + | +
column_names |
+
+ None, or list of str
+ |
+
+
+
+ List of feature names for the dataset. If None, then column names from the X_train dataframe are used. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names e.g. ['neg', 'pos']. If none, the default ['Negative Class', 'Positive Class'] are +used. + |
+
+ None
+ |
+
precalc_shap |
+
+ (Optional, None or array)
+ |
+
+
+
+ Precalculated shap values. If provided, they don't need to be computed. + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+
+
+
+ SHAP Values for X. + |
+
probatus/interpret/shap_dependence.py
123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 |
|
plot(feature, figsize=(15, 10), bins=10, show=True, min_q=0, max_q=1, alpha=1.0)
+
+Plots the shap values for data points for a given feature, as well as the target rate and values distribution.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
feature |
+
+ str or int
+ |
+
+
+
+ Feature name of the feature to be analyzed. + |
+ + required + | +
figsize |
+
+ (float, float)
+ |
+
+
+
+ Tuple specifying size (width, height) of resulting figure in inches. + |
+
+ (15, 10)
+ |
+
bins |
+
+ int or list[float]
+ |
+
+
+
+ Number of bins or boundaries of bins (supplied in list) for target-rate plot. + |
+
+ 10
+ |
+
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are shown to the user, otherwise they are not shown. Not showing the plot can be useful, +when you want to edit the returned axis, before showing it. + |
+
+ True
+ |
+
min_q |
+
+ float
+ |
+
+
+
+ Optional minimum quantile from which to consider values, used for plotting under outliers. + |
+
+ 0
+ |
+
max_q |
+
+ float
+ |
+
+
+
+ Optional maximum quantile until which data points are considered, used for plotting under outliers. + |
+
+ 1
+ |
+
alpha |
+
+ float
+ |
+
+
+
+ Optional alpha blending value, between 0 (transparent) and 1 (opaque). + |
+
+ 1.0
+ |
+
Returns + (list(matplotlib.axes)): + List of axes that include the plots.
+ +probatus/interpret/shap_dependence.py
160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 |
|
The goal of sample similarity module is understanding how different two samples are from a multivariate perspective.
+One of the ways to indicate this is Resemblance Model. Having two datasets - say X1 and X2 - one can analyse how easy it is to recognize which dataset a randomly selected row comes from. The Resemblance model assigns label 0 to the dataset X1, and label 1 to X2 and trains a binary classification model to predict which sample a given row comes from. +By looking at the test AUC, one can conclude that the samples have a different distribution if the AUC is significantly higher than 0.5. Furthermore, by analysing feature importance one can understand which of the features have predictive power.
+ +The following features are implemented:
+BaseResemblanceModel
+
+
+
+ Bases: BaseFitComputePlotClass
This model checks for the similarity of two samples.
+A possible use case is analysis of whether the train sample differs +from the test sample, due to e.g. non-stationarity.
+This is a base class and needs to be extended by a fit() method, which implements how the data is split, +how the model is trained and evaluated. +Further, inheriting classes need to implement how feature importance should be indicated.
+ +probatus/sample_similarity/resemblance_model.py
15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 |
|
__init__(model, scoring='roc_auc', test_prc=0.25, n_jobs=1, verbose=0, random_state=None)
+
+Initializes the class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ model object
+ |
+
+
+
+ Regression or classification model or pipeline. + |
+ + required + | +
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined +classification scorers names in sklearn. +Another option is using probatus.utils.Scorer to define a custom metric. The recommended option for this +class is 'roc_auc'. + |
+
+ 'roc_auc'
+ |
+
test_prc |
+
+ float
+ |
+
+
+
+ Percentage of data used to test the model. By default 0.25 is set. + |
+
+ 0.25
+ |
+
n_jobs |
+
+ int
+ |
+
+
+
+ Number of parallel executions. If -1 use all available cores. By default 1. + |
+
+ 1
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set at each round of feature elimination. If it is None, the results will not be +reproducible and in random search at each iteration different hyperparameters might be tested. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/sample_similarity/resemblance_model.py
27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 |
|
compute(return_scores=False)
+
+Checks if fit() method has been run and computes the output variables.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
return_scores |
+
+ bool
+ |
+
+
+
+ Flag indicating whether the method should return a tuple (feature importances, train score, +test score), or feature importances. By default the second option is selected. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ tuple(DataFrame, float, float) or DataFrame
+ |
+
+
+
+ Depending on value of return_tuple either returns a tuple (feature importances, train AUC, test AUC), or +feature importances. + |
+
probatus/sample_similarity/resemblance_model.py
181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 |
|
fit(X1, X2, column_names=None, class_names=None)
+
+Base fit functionality that should be executed before each fit.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X1 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ First sample to be compared. It needs to have the same number of columns as X2. + |
+ + required + | +
X2 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ Second sample to be compared. It needs to have the same number of columns as X1. + |
+ + required + | +
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the +default ['First Sample', 'Second Sample'] are used. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ BaseResemblanceModel
+ |
+
+
+
+ Fitted object + |
+
probatus/sample_similarity/resemblance_model.py
87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 |
|
fit_compute(X1, X2, column_names=None, class_names=None, return_scores=False, **fit_kwargs)
+
+Fits the resemblance model and computes the report regarding feature importance.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X1 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ First sample to be compared. It needs to have the same number of columns as X2. + |
+ + required + | +
X2 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ Second sample to be compared. It needs to have the same number of columns as X1. + |
+ + required + | +
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the +default ['First Sample', 'Second Sample'] are used. + |
+
+ None
+ |
+
return_scores |
+
+ bool
+ |
+
+
+
+ Flag indicating whether the method should return a tuple (feature importances, train score, +test score), or feature importances. By default the second option is selected. + |
+
+ False
+ |
+
**fit_kwargs |
+ + | +
+
+
+ In case any other arguments are accepted by fit() method, they can be passed as keyword arguments. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ tuple of (pd.DataFrame, float, float) or pd.DataFrame
+ |
+
+
+
+ Depending on value of return_tuple either returns a tuple (feature importances, train AUC, test AUC), or +feature importances. + |
+
probatus/sample_similarity/resemblance_model.py
202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 |
|
get_data_splits()
+
+Returns the data splits used to train the Resemblance model.
+ + +Returns:
+Type | +Description | +
---|---|
+ (DataFrame, DataFrame, Series, Series)
+ |
+
+
+
+ X_train, X_test, y_train, y_test. + |
+
probatus/sample_similarity/resemblance_model.py
170 +171 +172 +173 +174 +175 +176 +177 +178 +179 |
|
plot()
+
+Plot.
+ +probatus/sample_similarity/resemblance_model.py
245 +246 +247 +248 +249 |
|
PermutationImportanceResemblance
+
+
+
+ Bases: BaseResemblanceModel
This model checks the similarity of two samples.
+A possible use case is analysis of whether the train sample differs +from the test sample, due to e.g. non-stationarity.
+It assigns labels to each sample, 0 to the first sample, 1 to the second. Then, it randomly selects a portion of +data to train on. The resulting model tries to distinguish which sample a given test row comes from. This +provides insights on how distinguishable these samples are and which features contribute to that. The feature +importance is calculated using permutation importance.
+If the model achieves a test AUC significantly different than 0.5, it indicates that it is possible to distinguish +between the samples, and therefore, the samples differ. +Features with a high permutation importance contribute to that effect the most. +Thus, their distribution might differ between two samples.
+Examples:
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from probatus.sample_similarity import PermutationImportanceResemblance
+X1, _ = make_classification(n_samples=100, n_features=5)
+X2, _ = make_classification(n_samples=100, n_features=5, shift=0.5)
+model = RandomForestClassifier(max_depth=2)
+perm = PermutationImportanceResemblance(model)
+feature_importance = perm.fit_compute(X1, X2)
+perm.plot()
+
+
+
+ probatus/sample_similarity/resemblance_model.py
252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 |
|
__init__(model, iterations=100, scoring='roc_auc', test_prc=0.25, n_jobs=1, verbose=0, random_state=None)
+
+Initializes the class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ model object
+ |
+
+
+
+ Regression or classification model or pipeline. + |
+ + required + | +
iterations |
+
+ int
+ |
+
+
+
+ Number of iterations performed to calculate permutation importance. By default 100 iterations per +feature are done. + |
+
+ 100
+ |
+
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined +classification scorers names in sklearn. +Another option is using probatus.utils.Scorer to define a custom metric. Recommended option for this +class is 'roc_auc'. + |
+
+ 'roc_auc'
+ |
+
test_prc |
+
+ float
+ |
+
+
+
+ Percentage of data used to test the model. By default 0.25 is set. + |
+
+ 0.25
+ |
+
n_jobs |
+
+ int
+ |
+
+
+
+ Number of parallel executions. If -1 use all available cores. By default 1. + |
+
+ 1
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state used by the model and the train/test split. If it is None, the results will not be +reproducible. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/sample_similarity/resemblance_model.py
284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 |
|
fit(X1, X2, column_names=None, class_names=None)
+
+This function assigns labels to each sample, 0 to the first sample, 1 to the second.
+Then, it randomly selects a + portion of data to train on. The resulting model tries to distinguish which sample a given test row + comes from. This provides insights on how distinguishable these samples are and which features contribute to + that. The feature importance is calculated using permutation importance.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X1 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ First sample to be compared. It needs to have the same number of columns as X2. + |
+ + required + | +
X2 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ Second sample to be compared. It needs to have the same number of columns as X1. + |
+ + required + | +
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the +default ['First Sample', 'Second Sample'] are used. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ PermutationImportanceResemblance
+ |
+
+
+
+ Fitted object. + |
+
probatus/sample_similarity/resemblance_model.py
348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 |
|
plot(ax=None, top_n=None, show=True, **plot_kwargs)
+
+Plots the resulting AUC of the model as well as the feature importances.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ax |
+
+ axes
+ |
+
+
+
+ Axes to which the output should be plotted. If not provided new axes are created. + |
+
+ None
+ |
+
top_n |
+
+ int
+ |
+
+
+
+ Number of the most important features to be plotted. By default all features are included in the plot. + |
+
+ None
+ |
+
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are shown to the user, otherwise they are not shown. Not showing a plot can be useful +when you want to edit the returned axis before showing it. + |
+
+ True
+ |
+
**plot_kwargs |
+ + | +
+
+
+ Keyword arguments passed to the matplotlib.pyplot.subplots method. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ axes
+ |
+
+
+
+ Axes that include the plot. + |
+
probatus/sample_similarity/resemblance_model.py
418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 |
|
SHAPImportanceResemblance
+
+
+
+ Bases: BaseResemblanceModel
This model checks for similarity of two samples.
+A possible use case is analysis of whether the train sample differs + from the test sample, due to e.g. non-stationarity.
+It assigns labels to each sample, 0 to the first sample, 1 to the second. Then, it randomly selects a portion of + data to train on. The resulting model tries to distinguish which sample a given test row comes from. This + provides insights on how distinguishable these samples are and which features contribute to that. The feature + importance is calculated using SHAP feature importance.
+If the model achieves test AUC significantly different from 0.5, it indicates that it is possible to distinguish + between the samples, and therefore, the samples differ. Features with a high SHAP feature importance contribute + to that effect the most. Thus, their distribution might differ between two samples.
+This class currently works only with the Tree based models.
+Examples:
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from probatus.sample_similarity import SHAPImportanceResemblance
+X1, _ = make_classification(n_samples=100, n_features=5)
+X2, _ = make_classification(n_samples=100, n_features=5, shift=0.5)
+model = RandomForestClassifier(max_depth=2)
+rm = SHAPImportanceResemblance(model)
+feature_importance = rm.fit_compute(X1, X2)
+rm.plot()
+
++
+ +probatus/sample_similarity/resemblance_model.py
482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 |
|
__init__(model, scoring='roc_auc', test_prc=0.25, n_jobs=1, verbose=0, random_state=None)
+
+Initializes the class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ model object
+ |
+
+
+
+ Regression or classification model or pipeline. + |
+ + required + | +
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined +classification scorers names in sklearn. +Another option is using probatus.utils.Scorer to define a custom metric. Recommended option for this +class is 'roc_auc'. + |
+
+ 'roc_auc'
+ |
+
test_prc |
+
+ float
+ |
+
+
+
+ Percentage of data used to test the model. By default 0.25 is set. + |
+
+ 0.25
+ |
+
n_jobs |
+
+ int
+ |
+
+
+
+ Number of parallel executions. If -1 use all available cores. By default 1. + |
+
+ 1
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state used by the model and the train/test split. If it is None, the results will not be +reproducible. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/sample_similarity/resemblance_model.py
517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 |
|
fit(X1, X2, column_names=None, class_names=None, **shap_kwargs)
+
+This function assigns labels to each sample, 0 to the first sample, 1 to the second.
+Then, it randomly selects a + portion of data to train on. The resulting model tries to distinguish which sample a given test row + comes from. This provides insights on how distinguishable these samples are and which features contribute to + that. The feature importance is calculated using SHAP feature importance.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X1 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ First sample to be compared. It needs to have the same number of columns as X2. + |
+ + required + | +
X2 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ Second sample to be compared. It needs to have the same number of columns as X1. + |
+ + required + | +
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the +default ['First Sample', 'Second Sample'] are used. + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ SHAPImportanceResemblance
+ |
+
+
+
+ Fitted object. + |
+
probatus/sample_similarity/resemblance_model.py
569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 |
|
get_shap_values()
+
+Gets the SHAP values generated on the test set.
+ + +Returns:
+Type | +Description | +
---|---|
+ array
+ |
+
+
+
+ SHAP values generated on the test set. + |
+
probatus/sample_similarity/resemblance_model.py
664 +665 +666 +667 +668 +669 +670 +671 +672 +673 |
|
plot(plot_type='bar', show=True, **summary_plot_kwargs)
+
+Plots the resulting AUC of the model as well as the feature importances.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
plot_type |
+
+ str
+ |
+
+
+
+ Type of plot, used to compute shap.summary_plot. By default 'bar', available ones +are "dot", "bar", "violin", + |
+
+ 'bar'
+ |
+
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are shown to the user, otherwise they are not shown. Not showing a plot can be useful +when you want to edit the returned axis before showing it. + |
+
+ True
+ |
+
**summary_plot_kwargs |
+ + | +
+
+
+ kwargs passed to the shap.summary_plot. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ axes
+ |
+
+
+
+ Axes that include the plot. + |
+
probatus/sample_similarity/resemblance_model.py
613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 |
|
This module contains various smaller functionalities that can be used across the probatus
package.
Scorer
+
+
+Scores a given machine learning model based on the provided metric name and optionally a custom scoring function.
+Examples:
+from probatus.utils import Scorer
+from sklearn.metrics import make_scorer
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+import pandas as pd
+
+# Make ROC AUC scorer
+scorer1 = Scorer('roc_auc')
+
+# Make custom scorer with following function:
+def custom_metric(y_true, y_pred):
+ return (y_true == y_pred).sum()
+scorer2 = Scorer('custom_metric', custom_scorer=make_scorer(custom_metric))
+
+# Prepare two samples
+feature_names = ['f1', 'f2', 'f3', 'f4']
+X, y = make_classification(n_samples=1000, n_features=4, random_state=0)
+X = pd.DataFrame(X, columns=feature_names)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Prepare and fit model. Remember about class_weight="balanced" or an equivalent.
+model = RandomForestClassifier(class_weight='balanced', n_estimators = 100, max_depth=2, random_state=0)
+model = model.fit(X_train, y_train)
+
+# Score model
+score_test_scorer1 = scorer1.score(model, X_test, y_test)
+score_test_scorer2 = scorer2.score(model, X_test, y_test)
+
+print(f'Test ROC AUC is {score_test_scorer1}, Test {scorer2.metric_name} is {score_test_scorer2}')
+
+
+ probatus/utils/scoring.py
27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 |
|
__init__(metric_name, custom_scorer=None)
+
+Initializes the class.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
metric_name |
+
+ str
+ |
+
+
+
+ Name of the metric used to evaluate the model. +If the custom_scorer is not passed, the +metric name needs to be aligned with classification scorers names in sklearn +(link). + |
+ + required + | +
custom_scorer |
+
+ sklearn.metrics Scorer callable
+ |
+
+
+
+ Callable +that can score samples. + |
+
+ None
+ |
+
probatus/utils/scoring.py
67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 |
|
score(model, X, y)
+
+Scores the model on the provided samples, based on the provided metric name.
+Args + model (model object): + Model to be scored.
+X (array-like of shape (n_samples,n_features)):
+ Samples on which the model is scored.
+
+y (array-like of shape (n_samples,)):
+ Labels on which the model is scored.
+
+
+
+ Returns:
+Type | +Description | +
---|---|
+ float
+ |
+
+
+
+ Score returned by the model + |
+
probatus/utils/scoring.py
85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 |
|
get_single_scorer(scoring)
+
+Returns Scorer, based on provided input in scoring argument.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined classification scorers names in sklearn +(link). +Another option is using probatus.utils.Scorer to define a custom metric. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Scorer
+ |
+
+
+
+ Scorer that can be used for scoring models + |
+
probatus/utils/scoring.py
4 + 5 + 6 + 7 + 8 + 9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 |
|