From 88d15bc7dd2c51f8f3edad45ba7f873b02911cf6 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 9 Feb 2026 08:24:26 -0600 Subject: [PATCH 01/33] Add TextFeatures transformer for extracting text features - Add TextFeatures class to extract 19 text features from string columns - Features include char_count, word_count, sentence_count, digit_count, etc. - Add comprehensive test suite (16 tests) --- feature_engine/text/__init__.py | 9 + feature_engine/text/text_features.py | 327 ++++++++++++++++++++++++++ tests/test_text/__init__.py | 167 +++++++++++++ tests/test_text/test_text_features.py | 167 +++++++++++++ 4 files changed, 670 insertions(+) create mode 100644 feature_engine/text/__init__.py create mode 100644 feature_engine/text/text_features.py create mode 100644 tests/test_text/__init__.py create mode 100644 tests/test_text/test_text_features.py diff --git a/feature_engine/text/__init__.py b/feature_engine/text/__init__.py new file mode 100644 index 000000000..14626b79c --- /dev/null +++ b/feature_engine/text/__init__.py @@ -0,0 +1,9 @@ +""" +The module text includes classes to extract features from text/string variables. 
+""" + +from .text_features import TextFeatures + +__all__ = [ + "TextFeatures", +] diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py new file mode 100644 index 000000000..c06afdf79 --- /dev/null +++ b/feature_engine/text/text_features.py @@ -0,0 +1,327 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_init_input_params import ( + _check_param_drop_original, +) +from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X +from feature_engine.tags import _return_tags + +# Available text features and their computation functions +TEXT_FEATURES = { + "char_count": lambda x: x.str.len(), + "word_count": lambda x: x.str.split().str.len(), + "sentence_count": lambda x: x.str.count(r"[.!?]+"), + "avg_word_length": lambda x: x.apply( + lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1) + ), + "digit_count": lambda x: x.str.count(r"\d"), + "uppercase_count": lambda x: x.str.count(r"[A-Z]"), + "lowercase_count": lambda x: x.str.count(r"[a-z]"), + "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), + "whitespace_count": lambda x: x.str.count(r"\s"), + "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1), + "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1), + "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1), + "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int), + "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int), + "is_empty": lambda x: (x.str.len() == 0).astype(int), + "starts_with_uppercase": lambda x: 
x.str.match(r"^[A-Z]").astype(int), + "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), + "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))), + "unique_word_ratio": lambda x: x.apply( + lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1) + ), +} + + +class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + TextFeatures() extracts numerical features from text/string variables. This + transformer is useful for extracting basic text statistics that can be used + as features in machine learning models. + + The transformer can extract various text features including character counts, + word counts, sentence counts, and various ratios and indicators. + + A list of variables can be passed as an argument. Alternatively, the transformer + will automatically select and transform all variables of type object (string). + + More details in the :ref:`User Guide <text_features>`. + + Parameters + ---------- + variables: list, default=None + The list of text/string variables to extract features from. If None, the + transformer will automatically select all object (string) columns. + + features: list, default=None + List of text features to extract. Available features are: + + - 'char_count': Number of characters in the text + - 'word_count': Number of words (whitespace-separated tokens) + - 'sentence_count': Number of sentences (based on .!? 
punctuation) + - 'avg_word_length': Average length of words + - 'digit_count': Number of digit characters + - 'uppercase_count': Number of uppercase letters + - 'lowercase_count': Number of lowercase letters + - 'special_char_count': Number of special characters (non-alphanumeric) + - 'whitespace_count': Number of whitespace characters + - 'whitespace_ratio': Ratio of whitespace to total characters + - 'digit_ratio': Ratio of digits to total characters + - 'uppercase_ratio': Ratio of uppercase to total characters + - 'has_digits': Binary indicator if text contains digits + - 'has_uppercase': Binary indicator if text contains uppercase + - 'is_empty': Binary indicator if text is empty + - 'starts_with_uppercase': Binary indicator if text starts with uppercase + - 'ends_with_punctuation': Binary indicator if text ends with .!? + - 'unique_word_count': Number of unique words (case-insensitive) + - 'unique_word_ratio': Ratio of unique words to total words + + If None, extracts all available features. + + drop_original: bool, default=False + Whether to drop the original text columns after transformation. + + Attributes + ---------- + variables_: + The list of text variables that will be transformed. + + features_: + The list of features that will be extracted. + + feature_names_in_: + List with the names of features seen during fit. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + This transformer does not learn parameters. It stores the feature names + and validates input. + + fit_transform: + Fit to data, then transform it. + + transform: + Extract text features and add them to the dataframe. + + get_feature_names_out: + Get output feature names for transformation. + + See Also + -------- + feature_engine.encoding.StringSimilarityEncoder : + Encodes categorical variables based on string similarity. 
+ + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.text import TextFeatures + >>> X = pd.DataFrame({ + ... 'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123'] + ... }) + >>> tf = TextFeatures(features=['char_count', 'word_count', 'has_digits']) + >>> tf.fit(X) + >>> X = tf.transform(X) + >>> X + text text_char_count text_word_count text_has_digits + 0 Hello World! 12 2 0 + 1 Python is GREAT. 16 3 0 + 2 ML rocks 123 12 3 1 + """ + + def __init__( + self, + variables: Union[None, str, List[str]] = None, + features: Union[None, List[str]] = None, + drop_original: bool = False, + ) -> None: + + # Validate variables + if variables is not None: + if isinstance(variables, str): + variables = [variables] + elif not isinstance(variables, list) or not all( + isinstance(v, str) for v in variables + ): + raise ValueError( + "variables must be None, a string, or a list of strings. " + f"Got {type(variables).__name__} instead." + ) + + # Validate features + if features is not None: + if not isinstance(features, list) or not all( + isinstance(f, str) for f in features + ): + raise ValueError( + "features must be None or a list of strings. " + f"Got {type(features).__name__} instead." + ) + invalid_features = set(features) - set(TEXT_FEATURES.keys()) + if invalid_features: + raise ValueError( + f"Invalid features: {invalid_features}. " + f"Available features are: {list(TEXT_FEATURES.keys())}" + ) + + _check_param_drop_original(drop_original) + + self.variables = variables + self.features = features + self.drop_original = drop_original + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. + + Stores feature names and validates that the specified variables are + present and are of string/object type. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas Series, or np.array. Defaults to None. 
+ It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: TextFeatures + The fitted transformer. + """ + + # check input dataframe + X = check_X(X) + + # Find or validate text variables + if self.variables is None: + # Select object/string columns + self.variables_ = [col for col in X.columns if X[col].dtype == "object"] + if len(self.variables_) == 0: + raise ValueError( + "No object/string columns found in the dataframe. " + "Please specify variables explicitly." + ) + else: + # Validate user-specified variables exist + missing = set(self.variables) - set(X.columns) + if missing: + raise ValueError( + f"Variables {missing} are not present in the dataframe." + ) + self.variables_ = self.variables + + # Set features to extract + if self.features is None: + self.features_ = list(TEXT_FEATURES.keys()) + else: + self.features_ = self.features + + # save input features + self.feature_names_in_ = X.columns.tolist() + + # save train set shape + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Extract text features and add them to the dataframe. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: Pandas dataframe + The dataframe with the original columns plus the new text features. + """ + + # Check method fit has been called + check_is_fitted(self) + + # check that input is a dataframe + X = check_X(X) + + # Check if input data contains same number of columns as dataframe used to fit. 
+ _check_X_matches_training_df(X, self.n_features_in_) + + # reorder variables to match train set + X = X[self.feature_names_in_] + + # Extract features for each text variable + for var in self.variables_: + # Fill NaN with empty string for feature extraction + text_col = X[var].fillna("") + + for feature_name in self.features_: + new_col_name = f"{var}_{feature_name}" + feature_func = TEXT_FEATURES[feature_name] + X[new_col_name] = feature_func(text_col) + + # Fill any NaN values resulting from computation with 0 + X[new_col_name] = X[new_col_name].fillna(0) + + if self.drop_original: + X = X.drop(columns=self.variables_) + + return X + + def get_feature_names_out(self, input_features=None) -> List[str]: + """ + Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, uses feature_names_in_. + + Returns + ------- + feature_names_out : list of str + Output feature names. + """ + check_is_fitted(self) + + # Start with original features + if self.drop_original: + feature_names = [ + f for f in self.feature_names_in_ if f not in self.variables_ + ] + else: + feature_names = list(self.feature_names_in_) + + # Add new text feature names + for var in self.variables_: + for feature_name in self.features_: + feature_names.append(f"{var}_{feature_name}") + + return feature_names + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["allow_nan"] = True + tags_dict["variables"] = "categorical" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags diff --git a/tests/test_text/__init__.py b/tests/test_text/__init__.py new file mode 100644 index 000000000..f4e9de3ea --- /dev/null +++ b/tests/test_text/__init__.py @@ -0,0 +1,167 @@ +import pandas as pd +import pytest + +from feature_engine.text import TextFeatures + + +class TestTextFeatures: + """Test cases for TextFeatures 
transformer.""" + + def test_default_all_features(self): + """Test extracting all features with default parameters.""" + X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) + transformer = TextFeatures() + X_tr = transformer.fit_transform(X) + + # Check that new columns were added + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" in X_tr.columns + + # Check char_count + assert X_tr["text_char_count"].tolist() == [12, 10, 2] + + # Check word_count + assert X_tr["text_word_count"].tolist() == [2, 2, 1] + + # Check digit_count + assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + + def test_specific_features(self): + """Test extracting specific features only.""" + X = pd.DataFrame({"text": ["Hello", "World"]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + X_tr = transformer.fit_transform(X) + + # Check only specified features are extracted + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + def test_specific_variables(self): + """Test extracting features from specific variables only.""" + X = pd.DataFrame( + {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} + ) + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # Only text1 should have features extracted + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + def test_drop_original(self): + """Test drop_original parameter.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + def 
test_empty_string_handling(self): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + def test_nan_handling(self): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # NaN should be filled with empty string, resulting in char_count of 0 + assert X_tr["text_char_count"].tolist() == [5, 0, 5] + + def test_uppercase_features(self): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) + transformer = TextFeatures( + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + def test_sentence_count(self): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) + transformer = TextFeatures(features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + def test_unique_word_features(self): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] + + def test_invalid_feature_raises_error(self): + """Test that invalid feature name raises ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(features=["invalid_feature"]) + + def test_invalid_variables_raises_error(self): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=123) + + def test_missing_variable_raises_error(self): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + def test_no_text_columns_raises_error(self): + """Test that no text columns raises error when variables=None.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures() + with pytest.raises(ValueError, match="No object/string columns found"): + transformer.fit(X) + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + + def test_get_feature_names_out(self): + """Test 
get_feature_names_out returns correct names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + def test_get_feature_names_out_with_drop(self): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py new file mode 100644 index 000000000..f4e9de3ea --- /dev/null +++ b/tests/test_text/test_text_features.py @@ -0,0 +1,167 @@ +import pandas as pd +import pytest + +from feature_engine.text import TextFeatures + + +class TestTextFeatures: + """Test cases for TextFeatures transformer.""" + + def test_default_all_features(self): + """Test extracting all features with default parameters.""" + X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) + transformer = TextFeatures() + X_tr = transformer.fit_transform(X) + + # Check that new columns were added + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" in X_tr.columns + + # Check char_count + assert X_tr["text_char_count"].tolist() == [12, 10, 2] + + # Check word_count + assert X_tr["text_word_count"].tolist() == [2, 2, 1] + + # Check digit_count + assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + + def test_specific_features(self): + """Test extracting specific features only.""" + X = pd.DataFrame({"text": ["Hello", "World"]}) + 
transformer = TextFeatures(features=["char_count", "word_count"]) + X_tr = transformer.fit_transform(X) + + # Check only specified features are extracted + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + def test_specific_variables(self): + """Test extracting features from specific variables only.""" + X = pd.DataFrame( + {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} + ) + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # Only text1 should have features extracted + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + def test_drop_original(self): + """Test drop_original parameter.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + def test_empty_string_handling(self): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + def test_nan_handling(self): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # NaN should be filled with empty string, resulting in char_count of 0 + assert X_tr["text_char_count"].tolist() == [5, 0, 5] + + def test_uppercase_features(self): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", 
"HeLLo"]}) + transformer = TextFeatures( + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + def test_sentence_count(self): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! C."]}) + transformer = TextFeatures(features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + def test_unique_word_features(self): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] + + def test_invalid_feature_raises_error(self): + """Test that invalid feature name raises ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(features=["invalid_feature"]) + + def test_invalid_variables_raises_error(self): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=123) + + def test_missing_variable_raises_error(self): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + def test_no_text_columns_raises_error(self): + """Test that no text columns raises error when variables=None.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures() + with pytest.raises(ValueError, match="No 
object/string columns found"): + transformer.fit(X) + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + + def test_get_feature_names_out(self): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + def test_get_feature_names_out_with_drop(self): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names From 848670fbc2a9418c9e04e349f68f798ba1d55afc Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 9 Feb 2026 08:34:47 -0600 Subject: [PATCH 02/33] Fix string dtype detection for pandas 2/3 compatibility Use pd.api.types.is_string_dtype() and is_object_dtype() instead of checking dtype == 'object' directly, which fails with StringDtype. 
--- feature_engine/text/text_features.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index c06afdf79..8a91f235e 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -209,8 +209,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Find or validate text variables if self.variables is None: - # Select object/string columns - self.variables_ = [col for col in X.columns if X[col].dtype == "object"] + # Select object/string columns (handles both object dtype and StringDtype) + self.variables_ = [ + col for col in X.columns + if pd.api.types.is_string_dtype(X[col]) + or pd.api.types.is_object_dtype(X[col]) + ] if len(self.variables_) == 0: raise ValueError( "No object/string columns found in the dataframe. " From d543d583c6f42a60bbfbd648673c89e74a1d3892 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Tue, 10 Feb 2026 18:19:42 -0600 Subject: [PATCH 03/33] Restore documentation files for TextFeatures transformer - Restore docs/user_guide/text/TextFeatures.rst and index.rst - Add docs/api_doc/text/TextFeatures.rst and index.rst - Update docs/user_guide/index.rst, docs/api_doc/index.rst, docs/index.rst - Update README.md with Text Features section --- README.md | 4 + docs/api_doc/index.rst | 1 + docs/api_doc/text/TextFeatures.rst | 6 + docs/api_doc/text/index.rst | 13 ++ docs/index.rst | 6 + docs/user_guide/index.rst | 1 + docs/user_guide/text/TextFeatures.rst | 243 ++++++++++++++++++++++++++ docs/user_guide/text/index.rst | 18 ++ 8 files changed, 292 insertions(+) create mode 100644 docs/api_doc/text/TextFeatures.rst create mode 100644 docs/api_doc/text/index.rst create mode 100644 docs/user_guide/text/TextFeatures.rst create mode 100644 docs/user_guide/text/index.rst diff --git a/README.md b/README.md index 82e496d76..1b9d4dc40 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ Please share your story 
by answering 1 quick question * Variable Creation * Variable Selection * Datetime Features +* Text Features * Time Series * Preprocessing * Scaling @@ -146,6 +147,9 @@ Please share your story by answering 1 quick question * DatetimeFeatures * DatetimeSubtraction * DatetimeOrdinal + +### Text Features + * TextFeatures ### Time Series * LagFeatures diff --git a/docs/api_doc/index.rst b/docs/api_doc/index.rst index 4e09a1a31..2a11913fc 100644 --- a/docs/api_doc/index.rst +++ b/docs/api_doc/index.rst @@ -25,6 +25,7 @@ Creation creation/index datetime/index + text/index Selection diff --git a/docs/api_doc/text/TextFeatures.rst b/docs/api_doc/text/TextFeatures.rst new file mode 100644 index 000000000..7b2b4f76f --- /dev/null +++ b/docs/api_doc/text/TextFeatures.rst @@ -0,0 +1,6 @@ +TextFeatures +============ + +.. autoclass:: feature_engine.text.TextFeatures + :members: + diff --git a/docs/api_doc/text/index.rst b/docs/api_doc/text/index.rst new file mode 100644 index 000000000..f87392fdd --- /dev/null +++ b/docs/api_doc/text/index.rst @@ -0,0 +1,13 @@ +.. -*- mode: rst -*- + +Text Features +============= + +Feature-engine's text transformers extract numerical features from text/string +variables. + +.. 
toctree:: + :maxdepth: 1 + + TextFeatures + diff --git a/docs/index.rst b/docs/index.rst index a04f8d4bb..371827505 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -65,6 +65,7 @@ Feature-engine includes transformers for: - Creation of new features - Feature selection - Datetime features +- Text features - Time series - Preprocessing - Scaling @@ -260,6 +261,11 @@ extract many new features from the date and time parts of the datetime variable: - :doc:`api_doc/datetime/DatetimeSubtraction`: computes subtractions between datetime variables - :doc:`api_doc/datetime/DatetimeOrdinal`: converts datetime variables into ordinal numbers +Text: +~~~~~ + +- :doc:`api_doc/text/TextFeatures`: extracts numerical features from text/string variables + Feature Selection: ~~~~~~~~~~~~~~~~~~ diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index c786e77e1..52c33a8f4 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -28,6 +28,7 @@ Creation creation/index datetime/index + text/index Selection diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst new file mode 100644 index 000000000..0d513e6ab --- /dev/null +++ b/docs/user_guide/text/TextFeatures.rst @@ -0,0 +1,243 @@ +.. _text_features: + +.. currentmodule:: feature_engine.text + +Extracting Features from Text +============================= + +Short pieces of text are often found among the variables in our datasets. For example, in insurance, a text variable can describe the circumstances of an accident. Customer feedback is also stored as a text variable. + +While text data as such can't be used to train machine learning models, we can extract a lot of numerical information from these texts, which can provide predictive features to train machine learning models. + +Feature-engine allows you to quickly extract numerical features from short pieces of text, to complement your predictive models. 
These features aim to capture a piece of text’s complexity by looking at some statistical parameters of the text, such as the word length and count, the number of words and unique words used, the number of sentences, and so on. :class:`TextFeatures()` extracts many numerical features from text out-of-the-box. + +TextFeatures +============ + +:class:`TextFeatures()` extracts numerical features from text/string variables. +This transformer is useful for extracting basic text statistics that can be used +as features in machine learning models. Users must explicitly specify which columns +contain text data via the ``variables`` parameter. + +Unlike scikit-learn's CountVectorizer or TfidfVectorizer which create sparse matrices, +:class:`TextFeatures()` extracts metadata features that remain in DataFrame format +and can be easily combined with other Feature-engine or sklearn transformers in a pipeline. + +Text Features +------------- + +The transformer can extract the following features from a text piece: + +- **char_count**: Number of characters in the text +- **word_count**: Number of words (whitespace-separated tokens) +- **sentence_count**: Number of sentences (based on .!? 
punctuation) +- **avg_word_length**: Average length of words +- **digit_count**: Number of digit characters +- **letter_count**: Number of alphabetic characters (a-z, A-Z) +- **uppercase_count**: Number of uppercase letters +- **lowercase_count**: Number of lowercase letters +- **special_char_count**: Number of special characters (non-alphanumeric) +- **whitespace_count**: Number of whitespace characters +- **whitespace_ratio**: Ratio of whitespace to total characters +- **digit_ratio**: Ratio of digits to total characters +- **uppercase_ratio**: Ratio of uppercase to total characters +- **has_digits**: Binary indicator if text contains digits +- **has_uppercase**: Binary indicator if text contains uppercase +- **is_empty**: Binary indicator if text is empty +- **starts_with_uppercase**: Binary indicator if text starts with uppercase +- **ends_with_punctuation**: Binary indicator if text ends with .!? +- **unique_word_count**: Number of unique words (case-insensitive) +- **lexical_diversity**: Ratio of total words to unique words + +The **number of sentences** is inferred by :class:`TextFeatures()` by counting blocks of +sentence-ending punctuation (., !, ?) as a proxy for sentence boundaries. This means that +multiple consecutive punctuation marks (e.g., "!!!" or "??") are counted as a single +sentence-ending, which avoids overestimating the count in emphatic text. + +However, this is still a simple heuristic. It won't handle edge cases like abbreviations +(e.g., 'Dr.', 'U.S.', 'e.g.', 'i.e.') or text without punctuation. These abbreviations +will be counted as sentence endings, resulting in an overestimate of the actual sentence count. + +The features **number of unique words** and **lexical diversity** are intended to capture the complexity of the text. Simpler texts have few unique words and tend to repeat them. More complex texts use a wider array of words and tend not to repeat them. 
Hence, in more complex texts, both the number of unique words and the lexical diversity are greater. + +Handling missing values +----------------------- + +By default, :class:`TextFeatures()` raises an error if the variables contain missing values. +This behavior can be changed by setting the parameter ``missing_values`` to ``'ignore'``. +In this case, missing values will be treated as empty strings, and the numerical features +will be calculated accordingly (e.g., word count and character count will be 0). + +.. code:: python + + import pandas as pd + import numpy as np + from feature_engine.text import TextFeatures + + # Create sample data with NaN + X = pd.DataFrame({ + 'text': ['Hello', np.nan, 'World'] + }) + + # Set up the transformer to ignore missing values + tf = TextFeatures( + variables=['text'], + features=['char_count'], + missing_values='ignore' + ) + + # Transform + X_transformed = tf.fit_transform(X) + + print(X_transformed) + +Output: + +.. code-block:: none + + text text_char_count + 0 Hello 5 + 1 NaN 0 + 2 World 5 + +Python demo +----------- + +Let's create a dataframe with text data and extract features: + +.. code:: python + + import pandas as pd + from feature_engine.text import TextFeatures + + # Create sample data + X = pd.DataFrame({ + 'review': [ + 'This product is AMAZING! Best purchase ever.', + 'Not great. Would not recommend.', + 'OK for the price. 3 out of 5 stars.', + 'TERRIBLE!!! DO NOT BUY!', + ], + 'title': [ + 'Great Product', + 'Disappointed', + 'Average', + 'Awful', + ] + }) + +Now let's extract five specific text features (the number of words, the number of characters, the number of sentences, whether the text contains digits, and the ratio of uppercase letters to total characters): + +..
code:: python + + # Set up the transformer with specific features + tf = TextFeatures( + variables=['review'], + features=['word_count', 'char_count', 'sentence_count', 'has_digits', 'uppercase_ratio'] + ) + + # Fit and transform + tf.fit(X) + X_transformed = tf.transform(X) + + print(X_transformed) + +Output: + +.. code-block:: none + + review title review_word_count review_char_count review_sentence_count review_has_digits review_uppercase_ratio + 0 This product is AMAZING! Best purchase ever. Great Product 7 44 2 0 0.204545 + 1 Not great. Would not recommend. Disappointed 5 31 2 0 0.064516 + 2 OK for the price. 3 out of 5 stars. Average 9 35 2 1 0.057143 + 3 TERRIBLE!!! DO NOT BUY! Awful 4 23 2 0 0.608696 + +Extracting all features +~~~~~~~~~~~~~~~~~~~~~~~ + +By default, if no text features are specified, all available features will be extracted: + +.. code:: python + + # Extract all features from a single text column + tf = TextFeatures(variables=['review']) + tf.fit(X) + X_transformed = tf.transform(X) + + print(X_transformed.head()) + +Dropping original columns +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can drop the original text columns after extracting features, by setting the parameter ``drop_original`` to ``True``: + +.. code:: python + + tf = TextFeatures( + variables=['review'], + features=['word_count', 'char_count'], + drop_original=True + ) + + tf.fit(X) + X_transformed = tf.transform(X) + + print(X_transformed) + +Combining with scikit-learn Bag-of-Words +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In most NLP tasks, it is common to use bag-of-words (e.g., ``CountVectorizer``) or TF-IDF (e.g., ``TfidfVectorizer``) to represent the text. :class:`TextFeatures()` can be used alongside these methods to provide additional metadata that might improve model performance. + +In the following example, we compare a baseline model using only TF-IDF with a model that combines TF-IDF and :class:`TextFeatures()` metadata: + +..
code:: python + + import pandas as pd + from sklearn.datasets import fetch_20newsgroups + from sklearn.model_selection import train_test_split + from sklearn.pipeline import Pipeline + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.compose import ColumnTransformer + from sklearn.linear_model import LogisticRegression + from sklearn.preprocessing import StandardScaler + + from feature_engine.text import TextFeatures + + # Load and split data + data = fetch_20newsgroups(subset='train', categories=['sci.space', 'rec.sport.hockey']) + df = pd.DataFrame({'text': data.data, 'target': data.target}) + X_train, X_test, y_train, y_test = train_test_split( + df[['text']], df['target'], test_size=0.3, random_state=42 + ) + + # 1. Baseline: TF-IDF only + tfidf_pipe = Pipeline([ + ('vec', ColumnTransformer([ + ('tfidf', TfidfVectorizer(max_features=500), 'text') + ])), + ('clf', LogisticRegression()) + ]) + tfidf_pipe.fit(X_train, y_train) + print(f"TF-IDF Accuracy: {tfidf_pipe.score(X_test, y_test):.3f}") + + # 2. Combined: TextFeatures + TF-IDF + combined_pipe = Pipeline([ + ('features', ColumnTransformer([ + ('text_meta', TextFeatures(variables=['text']), 'text'), + ('tfidf', TfidfVectorizer(max_features=500), 'text') + ])), + ('scaler', StandardScaler()), + ('clf', LogisticRegression()) + ]) + combined_pipe.fit(X_train, y_train) + print(f"Combined Accuracy: {combined_pipe.score(X_test, y_test):.3f}") + +Output: + +.. code-block:: none + + TF-IDF Accuracy: 0.957 + Combined Accuracy: 0.963 + +By adding statistical metadata through :class:`TextFeatures()`, we provided the model with information about text length, complexity, and style that is not explicitly captured by a word-count-based approach like TF-IDF, leading to a small but noticeable improvement in performance. 
diff --git a/docs/user_guide/text/index.rst b/docs/user_guide/text/index.rst new file mode 100644 index 000000000..ea23d7362 --- /dev/null +++ b/docs/user_guide/text/index.rst @@ -0,0 +1,18 @@ +.. -*- mode: rst -*- + +Text Feature Extraction +======================= + +Feature-engine's text module includes transformers to extract numerical features +from text/string variables. + +Text feature extraction is useful for machine learning problems where you have +text data but want to derive numerical statistics without creating sparse +bag-of-words or TF-IDF representations. + +**Transformers** + +.. toctree:: + :maxdepth: 1 + + TextFeatures From 1ed7777616e74ae3d9c4b6d5478f8a3c69fccf5a Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Tue, 10 Feb 2026 18:22:00 -0600 Subject: [PATCH 04/33] Add tests to improve coverage for TextFeatures transformer - Test string variable auto-conversion to list - Test invalid features type error - Test multiple text columns - Test transform on new data after fit - Test punctuation, ratio, avg_word_length, and lowercase features --- tests/test_text/test_text_features.py | 77 +++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index f4e9de3ea..b7c716757 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -165,3 +165,80 @@ def test_get_feature_names_out_with_drop(self): assert "text" not in feature_names assert "other" in feature_names assert "text_char_count" in feature_names + + def test_string_variable_input(self): + """Test that passing a single string variable works (auto-converted to list).""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": ["A", "B"]}) + transformer = TextFeatures(variables="text", features=["char_count"]) + X_tr = transformer.fit_transform(X) + + assert "text_char_count" in X_tr.columns + assert "other_char_count" not in X_tr.columns + assert 
X_tr["text_char_count"].tolist() == [5, 5] + + def test_invalid_features_type_raises_error(self): + """Test that invalid features type raises ValueError.""" + with pytest.raises(ValueError, match="features must be"): + TextFeatures(features="char_count") + + def test_multiple_text_columns(self): + """Test extracting features from multiple text columns.""" + X = pd.DataFrame({"a": ["Hello", "World"], "b": ["Foo", "Bar"]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + X_tr = transformer.fit_transform(X) + + assert "a_char_count" in X_tr.columns + assert "b_char_count" in X_tr.columns + assert "a_word_count" in X_tr.columns + assert "b_word_count" in X_tr.columns + + def test_transform_on_new_data(self): + """Test transform works on new data after fit.""" + X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) + X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) + + transformer = TextFeatures(features=["char_count", "has_digits"]) + transformer.fit(X_train) + X_tr = transformer.transform(X_test) + + assert X_tr["text_char_count"].tolist() == [8, 8] + assert X_tr["text_has_digits"].tolist() == [0, 1] + + def test_punctuation_features(self): + """Test punctuation-related features.""" + X = pd.DataFrame({"text": ["Hello.", "World", "Hi!"]}) + transformer = TextFeatures( + features=["ends_with_punctuation", "special_char_count"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_ends_with_punctuation"].tolist() == [1, 0, 1] + assert X_tr["text_special_char_count"].tolist() == [1, 0, 1] + + def test_ratio_features(self): + """Test ratio features with known values.""" + X = pd.DataFrame({"text": ["AB12", "abcd"]}) + transformer = TextFeatures( + features=["digit_ratio", "uppercase_ratio", "whitespace_ratio"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_digit_ratio"].tolist() == [0.5, 0.0] + assert X_tr["text_uppercase_ratio"].tolist() == [0.5, 0.0] + assert X_tr["text_whitespace_ratio"].tolist() == [0.0, 
0.0] + + def test_avg_word_length(self): + """Test average word length feature.""" + X = pd.DataFrame({"text": ["ab cd", "a"]}) + transformer = TextFeatures(features=["avg_word_length"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_avg_word_length"].tolist() == [2.0, 1.0] + + def test_lowercase_count(self): + """Test lowercase count feature.""" + X = pd.DataFrame({"text": ["Hello", "WORLD"]}) + transformer = TextFeatures(features=["lowercase_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_lowercase_count"].tolist() == [4, 0] From a4ab4ad0a81edab9c91be95709bb864eef00960b Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Tue, 10 Feb 2026 18:23:15 -0600 Subject: [PATCH 05/33] Fix Sphinx docstring: escape feature_names_in_ reference --- feature_engine/text/text_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 8a91f235e..b7f9885ac 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -295,7 +295,7 @@ def get_feature_names_out(self, input_features=None) -> List[str]: Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, uses feature_names_in_. + Input features. If ``None``, uses ``feature_names_in_``. 
Returns ------- From 0b8a18312cf87b7126ef736ac3d9599bb47ad141 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Tue, 10 Feb 2026 18:38:43 -0600 Subject: [PATCH 06/33] Add tests for full coverage of TextFeatures validation and tags --- tests/test_text/test_text_features.py | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index b7c716757..9b7683bcc 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -242,3 +242,31 @@ def test_lowercase_count(self): X_tr = transformer.fit_transform(X) assert X_tr["text_lowercase_count"].tolist() == [4, 0] + + def test_variables_list_non_strings_raises_error(self): + """Test that a list of non-string variables raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=[1, 2]) + + def test_features_list_non_strings_raises_error(self): + """Test that a list of non-string features raises ValueError.""" + with pytest.raises(ValueError, match="features must be"): + TextFeatures(features=[1, 2]) + + def test_more_tags(self): + """Test _more_tags returns expected tags.""" + transformer = TextFeatures() + tags = transformer._more_tags() + assert tags["allow_nan"] is True + assert tags["variables"] == "categorical" + + def test_sklearn_tags(self): + """Test __sklearn_tags__ returns expected tags.""" + import sklearn + + if hasattr(sklearn, "__version__") and tuple( + int(x) for x in sklearn.__version__.split(".")[:2] + ) >= (1, 6): + transformer = TextFeatures() + tags = transformer.__sklearn_tags__() + assert tags.input_tags.allow_nan is True From c38e81a7c82ca3ad9cc7479bc903e98695e99f96 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 12 Feb 2026 18:56:05 -0600 Subject: [PATCH 07/33] Address reviewer comments and optimize TextFeatures extraction --- docs/user_guide/text/TextFeatures.rst | 6 +- docs/user_guide/text/index.rst | 2 +- 
feature_engine/text/text_features.py | 92 +++++++++++++-------------- tests/test_text/test_text_features.py | 69 +++++++++++++------- 4 files changed, 96 insertions(+), 73 deletions(-) diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst index 0d513e6ab..d8383b899 100644 --- a/docs/user_guide/text/TextFeatures.rst +++ b/docs/user_guide/text/TextFeatures.rst @@ -47,7 +47,7 @@ The transformer can extract the following features from a text piece: - **starts_with_uppercase**: Binary indicator if text starts with uppercase - **ends_with_punctuation**: Binary indicator if text ends with .!? - **unique_word_count**: Number of unique words (case-insensitive) -- **lexical_diversity**: Ratio of total words to unique words +- **unique_word_ratio**: Ratio of unique words to total words The **number of sentences** is inferred by :class:`TextFeatures()` by counting blocks of sentence-ending punctuation (., !, ?) as a proxy for sentence boundaries. This means that @@ -58,7 +58,7 @@ However, this is still a simple heuristic. It won't handle edge cases like abbre (e.g., 'Dr.', 'U.S.', 'e.g.', 'i.e.') or text without punctuation. These abbreviations will be counted as sentence endings, resulting in an overestimate of the actual sentence count. -The features **number of unique words** and **lexical diversity** are intended to capture the complexity of the text. Simpler texts have few unique words and tend to repeat them. More complex texts use a wider array of words and tend not to repeat them. Hence, in more complex texts, both the number of unique words and the lexical diversity are greater. +The features **number of unique words** and **unique word ratio** are intended to capture the complexity of the text. Simpler texts have few unique words and tend to repeat them. More complex texts use a wider array of words and tend not to repeat them. Hence, in more complex texts, both the number of unique words and the unique word ratio are greater. 
Handling missing values ----------------------- @@ -153,7 +153,7 @@ Output: 3 TERRIBLE!!! DO NOT BUY! Awful 4 23 2 0 0.608696 Extracting all features -~~~~~~~~~~~~~~~~~~~~~~~ +----------------------- By default, if no text features are specified, all available features will be extracted: diff --git a/docs/user_guide/text/index.rst b/docs/user_guide/text/index.rst index ea23d7362..0a7ce55bb 100644 --- a/docs/user_guide/text/index.rst +++ b/docs/user_guide/text/index.rst @@ -7,7 +7,7 @@ Feature-engine's text module includes transformers to extract numerical features from text/string variables. Text feature extraction is useful for machine learning problems where you have -text data but want to derive numerical statistics without creating sparse +text data but want to derive numerical statistics without, or in addition to, creating sparse bag-of-words or TF-IDF representations. **Transformers** diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index b7f9885ac..091b53454 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -19,10 +19,12 @@ "char_count": lambda x: x.str.len(), "word_count": lambda x: x.str.split().str.len(), "sentence_count": lambda x: x.str.count(r"[.!?]+"), - "avg_word_length": lambda x: x.apply( - lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1) - ), + "avg_word_length": lambda x: ( + x.str.replace(r"\s+", "", regex=True).str.len() / + x.str.split().str.len() + ).fillna(0), "digit_count": lambda x: x.str.count(r"\d"), + "letter_count": lambda x: x.str.count(r"[a-zA-Z]"), "uppercase_count": lambda x: x.str.count(r"[A-Z]"), "lowercase_count": lambda x: x.str.count(r"[a-z]"), "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), @@ -35,10 +37,13 @@ "is_empty": lambda x: (x.str.len() == 0).astype(int), "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), "ends_with_punctuation": lambda x: 
x.str.match(r".*[.!?]$").astype(int), - "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))), - "unique_word_ratio": lambda x: x.apply( - lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1) + "unique_word_count": lambda x: ( + x.str.lower().str.split().apply(lambda s: len(set(s)) if isinstance(s, list) else 0) ), + "unique_word_ratio": lambda x: ( + x.str.lower().str.split().apply(lambda s: len(set(s)) if isinstance(s, list) else 0) / + x.str.split().str.len() + ).fillna(0), } @@ -48,19 +53,14 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): transformer is useful for extracting basic text statistics that can be used as features in machine learning models. - The transformer can extract various text features including character counts, - word counts, sentence counts, and various ratios and indicators. - - A list of variables can be passed as an argument. Alternatively, the transformer - will automatically select and transform all variables of type object (string). + A list of variables must be passed as an argument. More details in the :ref:`User Guide `. Parameters ---------- - variables: list, default=None - The list of text/string variables to extract features from. If None, the - transformer will automatically select all object (string) columns. + variables: list + The list of text/string variables to extract features from. features: list, default=None List of text features to extract. Available features are: @@ -70,6 +70,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): - 'sentence_count': Number of sentences (based on .!? 
punctuation) - 'avg_word_length': Average length of words - 'digit_count': Number of digit characters + - 'letter_count': Number of alphabetic characters (a-z, A-Z) - 'uppercase_count': Number of uppercase letters - 'lowercase_count': Number of lowercase letters - 'special_char_count': Number of special characters (non-alphanumeric) @@ -132,7 +133,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): >>> X = pd.DataFrame({ ... 'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123'] ... }) - >>> tf = TextFeatures(features=['char_count', 'word_count', 'has_digits']) + >>> tf = TextFeatures(variables=['text'], features=['char_count', 'word_count', 'has_digits']) >>> tf.fit(X) >>> X = tf.transform(X) >>> X @@ -144,22 +145,21 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): def __init__( self, - variables: Union[None, str, List[str]] = None, + variables: Union[str, List[str]], features: Union[None, List[str]] = None, drop_original: bool = False, ) -> None: # Validate variables - if variables is not None: - if isinstance(variables, str): - variables = [variables] - elif not isinstance(variables, list) or not all( - isinstance(v, str) for v in variables - ): - raise ValueError( - "variables must be None, a string, or a list of strings. " - f"Got {type(variables).__name__} instead." - ) + if isinstance(variables, str): + variables = [variables] + if not isinstance(variables, list) or not all( + isinstance(v, str) for v in variables + ): + raise ValueError( + "variables must be a string or a list of strings. " + f"Got {type(variables).__name__} instead." 
+ ) # Validate features if features is not None: @@ -207,27 +207,27 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check input dataframe X = check_X(X) - # Find or validate text variables - if self.variables is None: - # Select object/string columns (handles both object dtype and StringDtype) - self.variables_ = [ - col for col in X.columns - if pd.api.types.is_string_dtype(X[col]) + # Validate user-specified variables exist + missing = set(self.variables) - set(X.columns) + if missing: + raise ValueError(f"Variables {missing} are not present in the dataframe.") + + # Validate that the variables are object or string + non_text = [ + col + for col in self.variables + if not ( + pd.api.types.is_string_dtype(X[col]) or pd.api.types.is_object_dtype(X[col]) - ] - if len(self.variables_) == 0: - raise ValueError( - "No object/string columns found in the dataframe. " - "Please specify variables explicitly." - ) - else: - # Validate user-specified variables exist - missing = set(self.variables) - set(X.columns) - if missing: - raise ValueError( - f"Variables {missing} are not present in the dataframe." - ) - self.variables_ = self.variables + ) + ] + if non_text: + raise ValueError( + f"Variables {non_text} are not object or string. " + "Please provide text variables only." 
+ ) + + self.variables_ = self.variables # Set features to extract if self.features is None: diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 9b7683bcc..63bbb08aa 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -10,7 +10,7 @@ class TestTextFeatures: def test_default_all_features(self): """Test extracting all features with default parameters.""" X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) - transformer = TextFeatures() + transformer = TextFeatures(variables=["text"]) X_tr = transformer.fit_transform(X) # Check that new columns were added @@ -30,7 +30,7 @@ def test_default_all_features(self): def test_specific_features(self): """Test extracting specific features only.""" X = pd.DataFrame({"text": ["Hello", "World"]}) - transformer = TextFeatures(features=["char_count", "word_count"]) + transformer = TextFeatures(variables=["text"], features=["char_count", "word_count"]) X_tr = transformer.fit_transform(X) # Check only specified features are extracted @@ -54,7 +54,7 @@ def test_specific_variables(self): def test_drop_original(self): """Test drop_original parameter.""" X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) - transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer = TextFeatures(variables=["text"], features=["char_count"], drop_original=True) X_tr = transformer.fit_transform(X) assert "text" not in X_tr.columns @@ -64,7 +64,9 @@ def test_drop_original(self): def test_empty_string_handling(self): """Test handling of empty strings.""" X = pd.DataFrame({"text": ["", "Hello", ""]}) - transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + transformer = TextFeatures( + variables=["text"], features=["char_count", "word_count", "is_empty"] + ) X_tr = transformer.fit_transform(X) assert X_tr["text_char_count"].tolist() == [0, 5, 0] @@ -73,17 +75,34 @@ def 
test_empty_string_handling(self): def test_nan_handling(self): """Test handling of NaN values.""" X = pd.DataFrame({"text": ["Hello", None, "World"]}) - transformer = TextFeatures(features=["char_count"]) + transformer = TextFeatures(variables=["text"], features=["char_count"]) X_tr = transformer.fit_transform(X) # NaN should be filled with empty string, resulting in char_count of 0 assert X_tr["text_char_count"].tolist() == [5, 0, 5] + def test_letter_count(self): + """Test letter count feature.""" + X = pd.DataFrame({"text": ["Hello 123", "WORLD!", "abc..."]}) + transformer = TextFeatures(features=["letter_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_letter_count"].tolist() == [5, 5, 3] + + def test_letter_count(self): + """Test letter count feature.""" + X = pd.DataFrame({"text": ["Hello 123", "WORLD!", "abc..."]}) + transformer = TextFeatures(variables=["text"], features=["letter_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_letter_count"].tolist() == [5, 5, 3] + def test_uppercase_features(self): """Test uppercase-related features.""" X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) transformer = TextFeatures( - features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + variables=["text"], + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"], ) X_tr = transformer.fit_transform(X) @@ -94,7 +113,7 @@ def test_uppercase_features(self): def test_sentence_count(self): """Test sentence counting.""" X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) - transformer = TextFeatures(features=["sentence_count"]) + transformer = TextFeatures(variables=["text"], features=["sentence_count"]) X_tr = transformer.fit_transform(X) assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] @@ -102,7 +121,9 @@ def test_sentence_count(self): def test_unique_word_features(self): """Test unique word features.""" X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) - transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + transformer = TextFeatures( + variables=["text"], features=["unique_word_count", "unique_word_ratio"] + ) X_tr = transformer.fit_transform(X) assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] @@ -111,7 +132,7 @@ def test_unique_word_features(self): def test_invalid_feature_raises_error(self): """Test that invalid feature name raises ValueError.""" with pytest.raises(ValueError, match="Invalid features"): - TextFeatures(features=["invalid_feature"]) + TextFeatures(variables=["text"], features=["invalid_feature"]) def test_invalid_variables_raises_error(self): """Test that invalid variables parameter raises ValueError.""" @@ -126,16 +147,16 @@ def test_missing_variable_raises_error(self): transformer.fit(X) def test_no_text_columns_raises_error(self): - """Test that no text columns raises error when variables=None.""" + """Test that no text columns raises error.""" X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - transformer = TextFeatures() - with pytest.raises(ValueError, match="No object/string columns found"): + transformer = TextFeatures(variables=["a"]) + with pytest.raises(ValueError, match="not object or string"): transformer.fit(X) def test_fit_stores_attributes(self): """Test that fit stores expected attributes.""" X = pd.DataFrame({"text": ["Hello"]}) - transformer = TextFeatures() + transformer = TextFeatures(variables=["text"]) transformer.fit(X) assert hasattr(transformer, "variables_") @@ -146,7 +167,7 @@ def test_fit_stores_attributes(self): def 
test_get_feature_names_out(self): """Test get_feature_names_out returns correct names.""" X = pd.DataFrame({"text": ["Hello"], "other": [1]}) - transformer = TextFeatures(features=["char_count", "word_count"]) + transformer = TextFeatures(variables=["text"], features=["char_count", "word_count"]) transformer.fit(X) feature_names = transformer.get_feature_names_out() @@ -158,7 +179,7 @@ def test_get_feature_names_out(self): def test_get_feature_names_out_with_drop(self): """Test get_feature_names_out with drop_original=True.""" X = pd.DataFrame({"text": ["Hello"], "other": [1]}) - transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer = TextFeatures(variables=["text"], features=["char_count"], drop_original=True) transformer.fit(X) feature_names = transformer.get_feature_names_out() @@ -179,12 +200,12 @@ def test_string_variable_input(self): def test_invalid_features_type_raises_error(self): """Test that invalid features type raises ValueError.""" with pytest.raises(ValueError, match="features must be"): - TextFeatures(features="char_count") + TextFeatures(variables=["text"], features="char_count") def test_multiple_text_columns(self): """Test extracting features from multiple text columns.""" X = pd.DataFrame({"a": ["Hello", "World"], "b": ["Foo", "Bar"]}) - transformer = TextFeatures(features=["char_count", "word_count"]) + transformer = TextFeatures(variables=["a", "b"], features=["char_count", "word_count"]) X_tr = transformer.fit_transform(X) assert "a_char_count" in X_tr.columns @@ -197,7 +218,7 @@ def test_transform_on_new_data(self): X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) - transformer = TextFeatures(features=["char_count", "has_digits"]) + transformer = TextFeatures(variables=["text"], features=["char_count", "has_digits"]) transformer.fit(X_train) X_tr = transformer.transform(X_test) @@ -208,6 +229,7 @@ def test_punctuation_features(self): """Test 
punctuation-related features.""" X = pd.DataFrame({"text": ["Hello.", "World", "Hi!"]}) transformer = TextFeatures( + variables=["text"], features=["ends_with_punctuation", "special_char_count"] ) X_tr = transformer.fit_transform(X) @@ -219,6 +241,7 @@ def test_ratio_features(self): """Test ratio features with known values.""" X = pd.DataFrame({"text": ["AB12", "abcd"]}) transformer = TextFeatures( + variables=["text"], features=["digit_ratio", "uppercase_ratio", "whitespace_ratio"] ) X_tr = transformer.fit_transform(X) @@ -230,7 +253,7 @@ def test_ratio_features(self): def test_avg_word_length(self): """Test average word length feature.""" X = pd.DataFrame({"text": ["ab cd", "a"]}) - transformer = TextFeatures(features=["avg_word_length"]) + transformer = TextFeatures(variables=["text"], features=["avg_word_length"]) X_tr = transformer.fit_transform(X) assert X_tr["text_avg_word_length"].tolist() == [2.0, 1.0] @@ -238,7 +261,7 @@ def test_avg_word_length(self): def test_lowercase_count(self): """Test lowercase count feature.""" X = pd.DataFrame({"text": ["Hello", "WORLD"]}) - transformer = TextFeatures(features=["lowercase_count"]) + transformer = TextFeatures(variables=["text"], features=["lowercase_count"]) X_tr = transformer.fit_transform(X) assert X_tr["text_lowercase_count"].tolist() == [4, 0] @@ -251,11 +274,11 @@ def test_variables_list_non_strings_raises_error(self): def test_features_list_non_strings_raises_error(self): """Test that a list of non-string features raises ValueError.""" with pytest.raises(ValueError, match="features must be"): - TextFeatures(features=[1, 2]) + TextFeatures(variables=["text"], features=[1, 2]) def test_more_tags(self): """Test _more_tags returns expected tags.""" - transformer = TextFeatures() + transformer = TextFeatures(variables=["text"]) tags = transformer._more_tags() assert tags["allow_nan"] is True assert tags["variables"] == "categorical" @@ -267,6 +290,6 @@ def test_sklearn_tags(self): if hasattr(sklearn, 
"__version__") and tuple( int(x) for x in sklearn.__version__.split(".")[:2] ) >= (1, 6): - transformer = TextFeatures() + transformer = TextFeatures(variables=["text"]) tags = transformer.__sklearn_tags__() assert tags.input_tags.allow_nan is True From ef22d6f41d27d08b13f80f477c2e387a0d804f90 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 12 Feb 2026 18:58:18 -0600 Subject: [PATCH 08/33] Fix style violations and broken tests --- feature_engine/text/text_features.py | 16 +++++++---- tests/test_text/test_text_features.py | 40 +++++++++++++++++---------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 091b53454..8b620efcb 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -38,11 +38,14 @@ "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), "unique_word_count": lambda x: ( - x.str.lower().str.split().apply(lambda s: len(set(s)) if isinstance(s, list) else 0) + x.str.lower().str.split().apply( + lambda s: len(set(s)) if isinstance(s, list) else 0 + ) ), "unique_word_ratio": lambda x: ( - x.str.lower().str.split().apply(lambda s: len(set(s)) if isinstance(s, list) else 0) / - x.str.split().str.len() + x.str.lower().str.split().apply( + lambda s: len(set(s)) if isinstance(s, list) else 0 + ) / x.str.split().str.len() ).fillna(0), } @@ -133,7 +136,10 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): >>> X = pd.DataFrame({ ... 'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123'] ... }) - >>> tf = TextFeatures(variables=['text'], features=['char_count', 'word_count', 'has_digits']) + >>> tf = TextFeatures( + ... variables=['text'], + ... features=['char_count', 'word_count', 'has_digits'] + ... 
) >>> tf.fit(X) >>> X = tf.transform(X) >>> X @@ -146,7 +152,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): def __init__( self, variables: Union[str, List[str]], - features: Union[None, List[str]] = None, + features: Optional[List[str]] = None, drop_original: bool = False, ) -> None: diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 63bbb08aa..fbaac6336 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -30,7 +30,10 @@ def test_default_all_features(self): def test_specific_features(self): """Test extracting specific features only.""" X = pd.DataFrame({"text": ["Hello", "World"]}) - transformer = TextFeatures(variables=["text"], features=["char_count", "word_count"]) + transformer = TextFeatures( + variables=["text"], + features=["char_count", "word_count"] + ) X_tr = transformer.fit_transform(X) # Check only specified features are extracted @@ -54,7 +57,11 @@ def test_specific_variables(self): def test_drop_original(self): """Test drop_original parameter.""" X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) - transformer = TextFeatures(variables=["text"], features=["char_count"], drop_original=True) + transformer = TextFeatures( + variables=["text"], + features=["char_count"], + drop_original=True + ) X_tr = transformer.fit_transform(X) assert "text" not in X_tr.columns @@ -81,14 +88,6 @@ def test_nan_handling(self): # NaN should be filled with empty string, resulting in char_count of 0 assert X_tr["text_char_count"].tolist() == [5, 0, 5] - def test_letter_count(self): - """Test letter count feature.""" - X = pd.DataFrame({"text": ["Hello 123", "WORLD!", "abc..."]}) - transformer = TextFeatures(features=["letter_count"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_letter_count"].tolist() == [5, 5, 3] - def test_letter_count(self): """Test letter count feature.""" X = pd.DataFrame({"text": ["Hello 123", 
"WORLD!", "abc..."]}) @@ -167,7 +166,10 @@ def test_fit_stores_attributes(self): def test_get_feature_names_out(self): """Test get_feature_names_out returns correct names.""" X = pd.DataFrame({"text": ["Hello"], "other": [1]}) - transformer = TextFeatures(variables=["text"], features=["char_count", "word_count"]) + transformer = TextFeatures( + variables=["text"], + features=["char_count", "word_count"] + ) transformer.fit(X) feature_names = transformer.get_feature_names_out() @@ -179,7 +181,11 @@ def test_get_feature_names_out(self): def test_get_feature_names_out_with_drop(self): """Test get_feature_names_out with drop_original=True.""" X = pd.DataFrame({"text": ["Hello"], "other": [1]}) - transformer = TextFeatures(variables=["text"], features=["char_count"], drop_original=True) + transformer = TextFeatures( + variables=["text"], + features=["char_count"], + drop_original=True + ) transformer.fit(X) feature_names = transformer.get_feature_names_out() @@ -205,7 +211,10 @@ def test_invalid_features_type_raises_error(self): def test_multiple_text_columns(self): """Test extracting features from multiple text columns.""" X = pd.DataFrame({"a": ["Hello", "World"], "b": ["Foo", "Bar"]}) - transformer = TextFeatures(variables=["a", "b"], features=["char_count", "word_count"]) + transformer = TextFeatures( + variables=["a", "b"], + features=["char_count", "word_count"] + ) X_tr = transformer.fit_transform(X) assert "a_char_count" in X_tr.columns @@ -218,7 +227,10 @@ def test_transform_on_new_data(self): X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) - transformer = TextFeatures(variables=["text"], features=["char_count", "has_digits"]) + transformer = TextFeatures( + variables=["text"], + features=["char_count", "has_digits"] + ) transformer.fit(X_train) X_tr = transformer.transform(X_test) From cbaf426a5bceffe328bdac6ac38f89cbdfc12953 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 16 Feb 2026 
09:40:30 -0600 Subject: [PATCH 09/33] Address reviewer suggestions: rename unique_word_ratio to lexical_diversity, improve docs, refactor tests --- docs/user_guide/text/TextFeatures.rst | 125 +++++- feature_engine/text/text_features.py | 19 +- tests/test_text/__init__.py | 167 ------- tests/test_text/test_text_features.py | 613 ++++++++++++++------------ 4 files changed, 432 insertions(+), 492 deletions(-) diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst index d8383b899..659c98570 100644 --- a/docs/user_guide/text/TextFeatures.rst +++ b/docs/user_guide/text/TextFeatures.rst @@ -12,7 +12,7 @@ While text data as such can't be used to train machine learning models, we can e Feature-engine allows you to quickly extract numerical features from short pieces of text, to complement your predictive models. These features aim to capture a piece of text’s complexity by looking at some statistical parameters of the text, such as the word length and count, the number of words and unique words used, the number of sentences, and so on. :class:`TextFeatures()` extracts many numerical features from text out-of-the-box. TextFeatures -============ +------------ :class:`TextFeatures()` extracts numerical features from text/string variables. This transformer is useful for extracting basic text statistics that can be used @@ -26,7 +26,7 @@ and can be easily combined with other Feature-engine or sklearn transformers in Text Features ------------- -The transformer can extract the following features from a text piece: +:class:`TextFeatures()` can extract the following features from a text piece: - **char_count**: Number of characters in the text - **word_count**: Number of words (whitespace-separated tokens) @@ -47,7 +47,7 @@ The transformer can extract the following features from a text piece: - **starts_with_uppercase**: Binary indicator if text starts with uppercase - **ends_with_punctuation**: Binary indicator if text ends with .!? 
- **unique_word_count**: Number of unique words (case-insensitive) -- **unique_word_ratio**: Ratio of unique words to total words +- **lexical_diversity**: Ratio of unique words to total words The **number of sentences** is inferred by :class:`TextFeatures()` by counting blocks of sentence-ending punctuation (., !, ?) as a proxy for sentence boundaries. This means that @@ -58,7 +58,7 @@ However, this is still a simple heuristic. It won't handle edge cases like abbre (e.g., 'Dr.', 'U.S.', 'e.g.', 'i.e.') or text without punctuation. These abbreviations will be counted as sentence endings, resulting in an overestimate of the actual sentence count. -The features **number of unique words** and **unique word ratio** are intended to capture the complexity of the text. Simpler texts have few unique words and tend to repeat them. More complex texts use a wider array of words and tend not to repeat them. Hence, in more complex texts, both the number of unique words and the unique word ratio are greater. +The features **number of unique words** and **lexical diversity** are intended to capture the complexity of the text. Simpler texts have few unique words and tend to repeat them. More complex texts use a wider array of words and tend not to repeat them. Hence, in more complex texts, both the number of unique words and the lexical diversity are greater. Handling missing values ----------------------- @@ -66,7 +66,7 @@ Handling missing values By default, :class:`TextFeatures()` raises an error if the variables contain missing values. This behavior can be changed by setting the parameter ``missing_values`` to ``'ignore'``. In this case, missing values will be treated as empty strings, and the numerical features -will be calculated accordingly (e.g., word count and character count will be 0). +will be calculated accordingly (e.g., word count and character count will be 0) as shown in the following example: .. 
code:: python @@ -91,7 +91,7 @@ will be calculated accordingly (e.g., word count and character count will be 0). print(X_transformed) -Output: +In the resulting dataframe, we see that the row with NaN returned 0 in the character count: .. code-block:: none @@ -126,7 +126,19 @@ Let's create a dataframe with text data and extract features: ] }) -Now let's extract 5 specific text features, the number of words, the number of characters, the number of sentences, whether the text has digits, and the ratio of upper- to lowercase: + print(X) + +The input dataframe looks like this: + +.. code-block:: none + + review title + 0 This product is AMAZING! Best purchase ever. Great Product + 1 Not great. Would not recommend. Disappointed + 2 OK for the price. 3 out of 5 stars. Average + 3 TERRIBLE!!! DO NOT BUY! Awful + +Now let's extract 5 specific text features: the number of words, the number of characters, the number of sentences, whether the text has digits, and the ratio of uppercase letters to total characters: .. code:: python @@ -142,18 +154,24 @@ Now let's extract 5 specific text features, the number of words, the number of c print(X_transformed) -Output: +In the following output, we see the resulting dataframe containing the numerical features extracted from the pieces of text: .. code-block:: none - review title review_word_count review_char_count review_sentence_count review_has_digits review_uppercase_ratio - 0 This product is AMAZING! Best purchase ever. Great Product 7 45 2 0 0.066667 - 1 Not great. Would not recommend. Disappointed 5 31 2 0 0.032258 - 2 OK for the price. 3 out of 5 stars. Average 8 35 2 3 0.057143 - 3 TERRIBLE!!! DO NOT BUY! Awful 4 23 2 0 0.608696 + review title review_word_count review_char_count + 0 This product is AMAZING! Best purchase ever. Great Product 7 45 + 1 Not great. Would not recommend. Disappointed 5 31 + 2 OK for the price. 3 out of 5 stars. Average 8 35 + 3 TERRIBLE!!! DO NOT BUY!
Awful 4 23 + + review_sentence_count review_has_digits review_uppercase_ratio + 0 2 0 0.066667 + 1 2 0 0.032258 + 2 2 1 0.057143 + 3 2 0 0.608696 Extracting all features ------------------------ +~~~~~~~~~~~~~~~~~~~~~~~ By default, if no text features are specified, all available features will be extracted: @@ -166,6 +184,45 @@ By default, if no text features are specified, all available features will be ex print(X_transformed.head()) +The output dataframe contains all 20 text features extracted from the ``review`` column: + +.. code-block:: none + + review_char_count review_word_count review_sentence_count review_avg_word_length + 0 45 7 2 5.428571 + 1 31 5 2 5.200000 + 2 35 8 2 3.375000 + 3 23 4 2 4.750000 + + review_digit_count review_letter_count review_uppercase_count review_lowercase_count + 0 0 38 3 35 + 1 0 26 1 25 + 2 2 25 2 23 + 3 0 14 14 0 + + review_special_char_count review_whitespace_count review_whitespace_ratio + 0 1 6 0.133333 + 1 2 4 0.129032 + 2 2 7 0.200000 + 3 3 3 0.130435 + + review_digit_ratio review_uppercase_ratio review_has_digits review_has_uppercase + 0 0.000000 0.066667 0 1 + 1 0.000000 0.032258 0 1 + 2 0.057143 0.057143 1 1 + 3 0.000000 0.608696 0 1 + + review_is_empty review_starts_with_uppercase review_ends_with_punctuation + 0 0 1 1 + 1 0 1 1 + 2 0 1 1 + 3 0 1 1 + + review_unique_word_count review_lexical_diversity + 0 7 1.000000 + 1 4 0.800000 + 2 8 1.000000 + 3 4 1.000000 Dropping original columns ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -184,6 +241,16 @@ You can drop the original text columns after extracting features, by setting the print(X_transformed) +The original ``review`` column has been removed, and only the ``title`` column and the extracted features remain: + +.. 
code-block:: none + + title review_word_count review_char_count + 0 Great Product 7 45 + 1 Disappointed 5 31 + 2 Average 8 35 + 3 Awful 4 23 + Combining with scikit-learn Bag-of-Words ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -196,11 +263,6 @@ In the following example, we compare a baseline model using only TF-IDF with a m import pandas as pd from sklearn.datasets import fetch_20newsgroups from sklearn.model_selection import train_test_split - from sklearn.pipeline import Pipeline - from sklearn.feature_extraction.text import TfidfVectorizer - from sklearn.compose import ColumnTransformer - from sklearn.linear_model import LogisticRegression - from sklearn.preprocessing import StandardScaler from feature_engine.text import TextFeatures @@ -211,6 +273,29 @@ In the following example, we compare a baseline model using only TF-IDF with a m df[['text']], df['target'], test_size=0.3, random_state=42 ) + print(X_train.head()) + +The input dataframe contains the raw text of newsgroup posts: + +.. code-block:: none + + text + 562 From: xxx@yyy.zzz (John Smith)\nSubject: Re:... + 459 From: aaa@bbb.ccc (Jane Doe)\nSubject: Shutt... + 21 From: ddd@eee.fff\nSubject: Space Station Fr... + 892 From: ggg@hhh.iii\nSubject: NHL Scores\nOrga... + 317 From: jjj@kkk.lll (Bob Wilson)\nSubject: Re:... + +Now let's set up two pipelines to compare a baseline model using only TF-IDF with a model that combines TF-IDF and :class:`TextFeatures()` metadata: + +.. code:: python + + from sklearn.pipeline import Pipeline + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.compose import ColumnTransformer + from sklearn.linear_model import LogisticRegression + from sklearn.preprocessing import StandardScaler + # 1. 
Baseline: TF-IDF only tfidf_pipe = Pipeline([ ('vec', ColumnTransformer([ @@ -233,7 +318,7 @@ In the following example, we compare a baseline model using only TF-IDF with a m combined_pipe.fit(X_train, y_train) print(f"Combined Accuracy: {combined_pipe.score(X_test, y_test):.3f}") -Output: +Below we see the accuracy of a model trained using only the bag of words, compared with a model trained using both the bag of words and the additional metadata: .. code-block:: none diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 8b620efcb..42602c43e 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -42,7 +42,7 @@ lambda s: len(set(s)) if isinstance(s, list) else 0 ) ), - "unique_word_ratio": lambda x: ( + "lexical_diversity": lambda x: ( x.str.lower().str.split().apply( lambda s: len(set(s)) if isinstance(s, list) else 0 ) / x.str.split().str.len() @@ -62,7 +62,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): Parameters ---------- - variables: list + variables: string, list The list of text/string variables to extract features from. features: list, default=None @@ -87,7 +87,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): - 'starts_with_uppercase': Binary indicator if text starts with uppercase - 'ends_with_punctuation': Binary indicator if text ends with .!? - 'unique_word_count': Number of unique words (case-insensitive) - - 'unique_word_ratio': Ratio of unique words to total words + - 'lexical_diversity': Ratio of unique words to total words If None, extracts all available features. @@ -111,8 +111,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): Methods ------- fit: - This transformer does not learn parameters. It stores the feature names - and validates input. + This transformer does not learn parameters. fit_transform: Fit to data, then transform it.
@@ -191,8 +190,6 @@ def __init__( def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ - This transformer does not learn parameters. - Stores feature names and validates that the specified variables are present and are of string/object type. @@ -276,15 +273,15 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # reorder variables to match train set X = X[self.feature_names_in_] + # Fill NaN with empty string for feature extraction + X[self.variables_] = X[self.variables_].fillna("") + # Extract features for each text variable for var in self.variables_: - # Fill NaN with empty string for feature extraction - text_col = X[var].fillna("") - for feature_name in self.features_: new_col_name = f"{var}_{feature_name}" feature_func = TEXT_FEATURES[feature_name] - X[new_col_name] = feature_func(text_col) + X[new_col_name] = feature_func(X[var]) # Fill any NaN values resulting from computation with 0 X[new_col_name] = X[new_col_name].fillna(0) diff --git a/tests/test_text/__init__.py b/tests/test_text/__init__.py index f4e9de3ea..e69de29bb 100644 --- a/tests/test_text/__init__.py +++ b/tests/test_text/__init__.py @@ -1,167 +0,0 @@ -import pandas as pd -import pytest - -from feature_engine.text import TextFeatures - - -class TestTextFeatures: - """Test cases for TextFeatures transformer.""" - - def test_default_all_features(self): - """Test extracting all features with default parameters.""" - X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) - transformer = TextFeatures() - X_tr = transformer.fit_transform(X) - - # Check that new columns were added - assert "text_char_count" in X_tr.columns - assert "text_word_count" in X_tr.columns - assert "text_digit_count" in X_tr.columns - - # Check char_count - assert X_tr["text_char_count"].tolist() == [12, 10, 2] - - # Check word_count - assert X_tr["text_word_count"].tolist() == [2, 2, 1] - - # Check digit_count - assert X_tr["text_digit_count"].tolist() == [0, 3, 0] - - def 
test_specific_features(self): - """Test extracting specific features only.""" - X = pd.DataFrame({"text": ["Hello", "World"]}) - transformer = TextFeatures(features=["char_count", "word_count"]) - X_tr = transformer.fit_transform(X) - - # Check only specified features are extracted - assert "text_char_count" in X_tr.columns - assert "text_word_count" in X_tr.columns - assert "text_digit_count" not in X_tr.columns - assert "text_uppercase_count" not in X_tr.columns - - def test_specific_variables(self): - """Test extracting features from specific variables only.""" - X = pd.DataFrame( - {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} - ) - transformer = TextFeatures(variables=["text1"], features=["char_count"]) - X_tr = transformer.fit_transform(X) - - # Only text1 should have features extracted - assert "text1_char_count" in X_tr.columns - assert "text2_char_count" not in X_tr.columns - - def test_drop_original(self): - """Test drop_original parameter.""" - X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) - transformer = TextFeatures(features=["char_count"], drop_original=True) - X_tr = transformer.fit_transform(X) - - assert "text" not in X_tr.columns - assert "text_char_count" in X_tr.columns - assert "other" in X_tr.columns - - def test_empty_string_handling(self): - """Test handling of empty strings.""" - X = pd.DataFrame({"text": ["", "Hello", ""]}) - transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_char_count"].tolist() == [0, 5, 0] - assert X_tr["text_is_empty"].tolist() == [1, 0, 1] - - def test_nan_handling(self): - """Test handling of NaN values.""" - X = pd.DataFrame({"text": ["Hello", None, "World"]}) - transformer = TextFeatures(features=["char_count"]) - X_tr = transformer.fit_transform(X) - - # NaN should be filled with empty string, resulting in char_count of 0 - assert X_tr["text_char_count"].tolist() == [5, 0, 5] - - 
def test_uppercase_features(self): - """Test uppercase-related features.""" - X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) - transformer = TextFeatures( - features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] - assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] - assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] - - def test_sentence_count(self): - """Test sentence counting.""" - X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! C."]}) - transformer = TextFeatures(features=["sentence_count"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] - - def test_unique_word_features(self): - """Test unique word features.""" - X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) - transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] - assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] - - def test_invalid_feature_raises_error(self): - """Test that invalid feature name raises ValueError.""" - with pytest.raises(ValueError, match="Invalid features"): - TextFeatures(features=["invalid_feature"]) - - def test_invalid_variables_raises_error(self): - """Test that invalid variables parameter raises ValueError.""" - with pytest.raises(ValueError, match="variables must be"): - TextFeatures(variables=123) - - def test_missing_variable_raises_error(self): - """Test that missing variable raises ValueError on fit.""" - X = pd.DataFrame({"text": ["Hello"]}) - transformer = TextFeatures(variables=["nonexistent"]) - with pytest.raises(ValueError, match="not present in the dataframe"): - transformer.fit(X) - - def test_no_text_columns_raises_error(self): - """Test that no text columns raises error when variables=None.""" - X = 
pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - transformer = TextFeatures() - with pytest.raises(ValueError, match="No object/string columns found"): - transformer.fit(X) - - def test_fit_stores_attributes(self): - """Test that fit stores expected attributes.""" - X = pd.DataFrame({"text": ["Hello"]}) - transformer = TextFeatures() - transformer.fit(X) - - assert hasattr(transformer, "variables_") - assert hasattr(transformer, "features_") - assert hasattr(transformer, "feature_names_in_") - assert hasattr(transformer, "n_features_in_") - - def test_get_feature_names_out(self): - """Test get_feature_names_out returns correct names.""" - X = pd.DataFrame({"text": ["Hello"], "other": [1]}) - transformer = TextFeatures(features=["char_count", "word_count"]) - transformer.fit(X) - - feature_names = transformer.get_feature_names_out() - assert "text" in feature_names - assert "other" in feature_names - assert "text_char_count" in feature_names - assert "text_word_count" in feature_names - - def test_get_feature_names_out_with_drop(self): - """Test get_feature_names_out with drop_original=True.""" - X = pd.DataFrame({"text": ["Hello"], "other": [1]}) - transformer = TextFeatures(features=["char_count"], drop_original=True) - transformer.fit(X) - - feature_names = transformer.get_feature_names_out() - assert "text" not in feature_names - assert "other" in feature_names - assert "text_char_count" in feature_names diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index fbaac6336..de914fa85 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -4,304 +4,329 @@ from feature_engine.text import TextFeatures -class TestTextFeatures: - """Test cases for TextFeatures transformer.""" +def test_default_all_features(): + """Test extracting all features with default parameters.""" + X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) + transformer = TextFeatures(variables=["text"]) + 
X_tr = transformer.fit_transform(X) + + # Check that new columns were added + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" in X_tr.columns - def test_default_all_features(self): - """Test extracting all features with default parameters.""" - X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) - transformer = TextFeatures(variables=["text"]) - X_tr = transformer.fit_transform(X) - - # Check that new columns were added - assert "text_char_count" in X_tr.columns - assert "text_word_count" in X_tr.columns - assert "text_digit_count" in X_tr.columns - - # Check char_count - assert X_tr["text_char_count"].tolist() == [12, 10, 2] - - # Check word_count - assert X_tr["text_word_count"].tolist() == [2, 2, 1] - - # Check digit_count - assert X_tr["text_digit_count"].tolist() == [0, 3, 0] - - def test_specific_features(self): - """Test extracting specific features only.""" - X = pd.DataFrame({"text": ["Hello", "World"]}) - transformer = TextFeatures( - variables=["text"], - features=["char_count", "word_count"] - ) - X_tr = transformer.fit_transform(X) - - # Check only specified features are extracted - assert "text_char_count" in X_tr.columns - assert "text_word_count" in X_tr.columns - assert "text_digit_count" not in X_tr.columns - assert "text_uppercase_count" not in X_tr.columns - - def test_specific_variables(self): - """Test extracting features from specific variables only.""" - X = pd.DataFrame( - {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} - ) - transformer = TextFeatures(variables=["text1"], features=["char_count"]) - X_tr = transformer.fit_transform(X) - - # Only text1 should have features extracted - assert "text1_char_count" in X_tr.columns - assert "text2_char_count" not in X_tr.columns - - def test_drop_original(self): - """Test drop_original parameter.""" - X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) - transformer = TextFeatures( - 
variables=["text"], - features=["char_count"], - drop_original=True - ) - X_tr = transformer.fit_transform(X) - - assert "text" not in X_tr.columns - assert "text_char_count" in X_tr.columns - assert "other" in X_tr.columns - - def test_empty_string_handling(self): - """Test handling of empty strings.""" - X = pd.DataFrame({"text": ["", "Hello", ""]}) - transformer = TextFeatures( - variables=["text"], features=["char_count", "word_count", "is_empty"] - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_char_count"].tolist() == [0, 5, 0] - assert X_tr["text_is_empty"].tolist() == [1, 0, 1] - - def test_nan_handling(self): - """Test handling of NaN values.""" - X = pd.DataFrame({"text": ["Hello", None, "World"]}) - transformer = TextFeatures(variables=["text"], features=["char_count"]) - X_tr = transformer.fit_transform(X) - - # NaN should be filled with empty string, resulting in char_count of 0 - assert X_tr["text_char_count"].tolist() == [5, 0, 5] - - def test_letter_count(self): - """Test letter count feature.""" - X = pd.DataFrame({"text": ["Hello 123", "WORLD!", "abc..."]}) - transformer = TextFeatures(variables=["text"], features=["letter_count"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_letter_count"].tolist() == [5, 5, 3] - - def test_uppercase_features(self): - """Test uppercase-related features.""" - X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) - transformer = TextFeatures( - variables=["text"], - features=["uppercase_count", "has_uppercase", "starts_with_uppercase"], - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] - assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] - assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] - - def test_sentence_count(self): - """Test sentence counting.""" - X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) - transformer = TextFeatures(variables=["text"], features=["sentence_count"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] - - def test_unique_word_features(self): - """Test unique word features.""" - X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) - transformer = TextFeatures( - variables=["text"], features=["unique_word_count", "unique_word_ratio"] - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] - assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] - - def test_invalid_feature_raises_error(self): - """Test that invalid feature name raises ValueError.""" - with pytest.raises(ValueError, match="Invalid features"): - TextFeatures(variables=["text"], features=["invalid_feature"]) - - def test_invalid_variables_raises_error(self): - """Test that invalid variables parameter raises ValueError.""" - with pytest.raises(ValueError, match="variables must be"): - TextFeatures(variables=123) - - def test_missing_variable_raises_error(self): - """Test that missing variable raises ValueError on fit.""" - X = pd.DataFrame({"text": ["Hello"]}) - transformer = TextFeatures(variables=["nonexistent"]) - with pytest.raises(ValueError, match="not present in the dataframe"): - transformer.fit(X) - - def test_no_text_columns_raises_error(self): - """Test that no text columns raises error.""" - X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - transformer = TextFeatures(variables=["a"]) - with pytest.raises(ValueError, match="not object or string"): - transformer.fit(X) - - def test_fit_stores_attributes(self): - """Test that fit stores expected attributes.""" - X = pd.DataFrame({"text": ["Hello"]}) - transformer = TextFeatures(variables=["text"]) - transformer.fit(X) + # Check char_count + assert X_tr["text_char_count"].tolist() == [12, 10, 2] + + # Check word_count + assert X_tr["text_word_count"].tolist() == [2, 2, 1] + + # Check 
digit_count + assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + + +def test_specific_features(): + """Test extracting specific features only.""" + X = pd.DataFrame({"text": ["Hello", "World"]}) + transformer = TextFeatures( + variables=["text"], + features=["char_count", "word_count"] + ) + X_tr = transformer.fit_transform(X) + + # Check only specified features are extracted + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + +def test_specific_variables(): + """Test extracting features from specific variables only.""" + X = pd.DataFrame( + {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} + ) + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # Only text1 should have features extracted + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + +def test_drop_original(): + """Test drop_original parameter.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures( + variables=["text"], + features=["char_count"], + drop_original=True + ) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + +def test_empty_string_handling(): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures( + variables=["text"], features=["char_count", "word_count", "is_empty"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + +def test_nan_handling(): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(variables=["text"], features=["char_count"]) + X_tr = 
transformer.fit_transform(X) + + # NaN should be filled with empty string, resulting in char_count of 0 + assert X_tr["text_char_count"].tolist() == [5, 0, 5] - assert hasattr(transformer, "variables_") - assert hasattr(transformer, "features_") - assert hasattr(transformer, "feature_names_in_") - assert hasattr(transformer, "n_features_in_") - - def test_get_feature_names_out(self): - """Test get_feature_names_out returns correct names.""" - X = pd.DataFrame({"text": ["Hello"], "other": [1]}) - transformer = TextFeatures( - variables=["text"], - features=["char_count", "word_count"] - ) + +def test_letter_count(): + """Test letter count feature.""" + X = pd.DataFrame({"text": ["Hello 123", "WORLD!", "abc..."]}) + transformer = TextFeatures(variables=["text"], features=["letter_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_letter_count"].tolist() == [5, 5, 3] + + +def test_uppercase_features(): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) + transformer = TextFeatures( + variables=["text"], + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"], + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + +def test_sentence_count(): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) + transformer = TextFeatures(variables=["text"], features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + +def test_unique_word_features(): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures( + variables=["text"], features=["unique_word_count", "lexical_diversity"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + assert X_tr["text_lexical_diversity"].tolist() == [1 / 3, 1.0, 1.0] + + +def test_invalid_feature_raises_error(): + """Test that invalid feature name raises ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(variables=["text"], features=["invalid_feature"]) + + +def test_invalid_variables_raises_error(): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=123) + + +def test_missing_variable_raises_error(): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): transformer.fit(X) - feature_names = transformer.get_feature_names_out() - assert "text" in feature_names - assert "other" in feature_names - assert "text_char_count" in feature_names - assert "text_word_count" in feature_names - - def test_get_feature_names_out_with_drop(self): - """Test get_feature_names_out with drop_original=True.""" - X = pd.DataFrame({"text": ["Hello"], "other": [1]}) - transformer = TextFeatures( - variables=["text"], - features=["char_count"], - drop_original=True - ) + +def test_no_text_columns_raises_error(): + """Test that no text columns raises error.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures(variables=["a"]) + 
with pytest.raises(ValueError, match="not object or string"): transformer.fit(X) - feature_names = transformer.get_feature_names_out() - assert "text" not in feature_names - assert "other" in feature_names - assert "text_char_count" in feature_names - - def test_string_variable_input(self): - """Test that passing a single string variable works (auto-converted to list).""" - X = pd.DataFrame({"text": ["Hello", "World"], "other": ["A", "B"]}) - transformer = TextFeatures(variables="text", features=["char_count"]) - X_tr = transformer.fit_transform(X) - - assert "text_char_count" in X_tr.columns - assert "other_char_count" not in X_tr.columns - assert X_tr["text_char_count"].tolist() == [5, 5] - - def test_invalid_features_type_raises_error(self): - """Test that invalid features type raises ValueError.""" - with pytest.raises(ValueError, match="features must be"): - TextFeatures(variables=["text"], features="char_count") - - def test_multiple_text_columns(self): - """Test extracting features from multiple text columns.""" - X = pd.DataFrame({"a": ["Hello", "World"], "b": ["Foo", "Bar"]}) - transformer = TextFeatures( - variables=["a", "b"], - features=["char_count", "word_count"] - ) - X_tr = transformer.fit_transform(X) - - assert "a_char_count" in X_tr.columns - assert "b_char_count" in X_tr.columns - assert "a_word_count" in X_tr.columns - assert "b_word_count" in X_tr.columns - - def test_transform_on_new_data(self): - """Test transform works on new data after fit.""" - X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) - X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) - - transformer = TextFeatures( - variables=["text"], - features=["char_count", "has_digits"] - ) - transformer.fit(X_train) - X_tr = transformer.transform(X_test) - - assert X_tr["text_char_count"].tolist() == [8, 8] - assert X_tr["text_has_digits"].tolist() == [0, 1] - - def test_punctuation_features(self): - """Test punctuation-related features.""" - X = pd.DataFrame({"text": 
["Hello.", "World", "Hi!"]}) - transformer = TextFeatures( - variables=["text"], - features=["ends_with_punctuation", "special_char_count"] - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_ends_with_punctuation"].tolist() == [1, 0, 1] - assert X_tr["text_special_char_count"].tolist() == [1, 0, 1] - - def test_ratio_features(self): - """Test ratio features with known values.""" - X = pd.DataFrame({"text": ["AB12", "abcd"]}) - transformer = TextFeatures( - variables=["text"], - features=["digit_ratio", "uppercase_ratio", "whitespace_ratio"] - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_digit_ratio"].tolist() == [0.5, 0.0] - assert X_tr["text_uppercase_ratio"].tolist() == [0.5, 0.0] - assert X_tr["text_whitespace_ratio"].tolist() == [0.0, 0.0] - - def test_avg_word_length(self): - """Test average word length feature.""" - X = pd.DataFrame({"text": ["ab cd", "a"]}) - transformer = TextFeatures(variables=["text"], features=["avg_word_length"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_avg_word_length"].tolist() == [2.0, 1.0] - - def test_lowercase_count(self): - """Test lowercase count feature.""" - X = pd.DataFrame({"text": ["Hello", "WORLD"]}) - transformer = TextFeatures(variables=["text"], features=["lowercase_count"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_lowercase_count"].tolist() == [4, 0] - - def test_variables_list_non_strings_raises_error(self): - """Test that a list of non-string variables raises ValueError.""" - with pytest.raises(ValueError, match="variables must be"): - TextFeatures(variables=[1, 2]) - - def test_features_list_non_strings_raises_error(self): - """Test that a list of non-string features raises ValueError.""" - with pytest.raises(ValueError, match="features must be"): - TextFeatures(variables=["text"], features=[1, 2]) - - def test_more_tags(self): - """Test _more_tags returns expected tags.""" + +def test_fit_stores_attributes(): + """Test that fit stores expected 
attributes.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["text"]) + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + + +def test_get_feature_names_out(): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures( + variables=["text"], + features=["char_count", "word_count"] + ) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + +def test_get_feature_names_out_with_drop(): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures( + variables=["text"], + features=["char_count"], + drop_original=True + ) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + + +def test_string_variable_input(): + """Test that passing a single string variable works (auto-converted to list).""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": ["A", "B"]}) + transformer = TextFeatures(variables="text", features=["char_count"]) + X_tr = transformer.fit_transform(X) + + assert "text_char_count" in X_tr.columns + assert "other_char_count" not in X_tr.columns + assert X_tr["text_char_count"].tolist() == [5, 5] + + +def test_invalid_features_type_raises_error(): + """Test that invalid features type raises ValueError.""" + with pytest.raises(ValueError, match="features must be"): + TextFeatures(variables=["text"], features="char_count") + + +def test_multiple_text_columns(): + """Test extracting features 
from multiple text columns.""" + X = pd.DataFrame({"a": ["Hello", "World"], "b": ["Foo", "Bar"]}) + transformer = TextFeatures( + variables=["a", "b"], + features=["char_count", "word_count"] + ) + X_tr = transformer.fit_transform(X) + + assert "a_char_count" in X_tr.columns + assert "b_char_count" in X_tr.columns + assert "a_word_count" in X_tr.columns + assert "b_word_count" in X_tr.columns + + +def test_transform_on_new_data(): + """Test transform works on new data after fit.""" + X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) + X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) + + transformer = TextFeatures( + variables=["text"], + features=["char_count", "has_digits"] + ) + transformer.fit(X_train) + X_tr = transformer.transform(X_test) + + assert X_tr["text_char_count"].tolist() == [8, 8] + assert X_tr["text_has_digits"].tolist() == [0, 1] + + +def test_punctuation_features(): + """Test punctuation-related features.""" + X = pd.DataFrame({"text": ["Hello.", "World", "Hi!"]}) + transformer = TextFeatures( + variables=["text"], + features=["ends_with_punctuation", "special_char_count"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_ends_with_punctuation"].tolist() == [1, 0, 1] + assert X_tr["text_special_char_count"].tolist() == [1, 0, 1] + + +def test_ratio_features(): + """Test ratio features with known values.""" + X = pd.DataFrame({"text": ["AB12", "abcd"]}) + transformer = TextFeatures( + variables=["text"], + features=["digit_ratio", "uppercase_ratio", "whitespace_ratio"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_digit_ratio"].tolist() == [0.5, 0.0] + assert X_tr["text_uppercase_ratio"].tolist() == [0.5, 0.0] + assert X_tr["text_whitespace_ratio"].tolist() == [0.0, 0.0] + + +def test_avg_word_length(): + """Test average word length feature.""" + X = pd.DataFrame({"text": ["ab cd", "a"]}) + transformer = TextFeatures(variables=["text"], features=["avg_word_length"]) + X_tr = 
transformer.fit_transform(X) + + assert X_tr["text_avg_word_length"].tolist() == [2.0, 1.0] + + +def test_lowercase_count(): + """Test lowercase count feature.""" + X = pd.DataFrame({"text": ["Hello", "WORLD"]}) + transformer = TextFeatures(variables=["text"], features=["lowercase_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_lowercase_count"].tolist() == [4, 0] + + +def test_variables_list_non_strings_raises_error(): + """Test that a list of non-string variables raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=[1, 2]) + + +def test_features_list_non_strings_raises_error(): + """Test that a list of non-string features raises ValueError.""" + with pytest.raises(ValueError, match="features must be"): + TextFeatures(variables=["text"], features=[1, 2]) + + +def test_more_tags(): + """Test _more_tags returns expected tags.""" + transformer = TextFeatures(variables=["text"]) + tags = transformer._more_tags() + assert tags["allow_nan"] is True + assert tags["variables"] == "categorical" + + +def test_sklearn_tags(): + """Test __sklearn_tags__ returns expected tags.""" + import sklearn + + if hasattr(sklearn, "__version__") and tuple( + int(x) for x in sklearn.__version__.split(".")[:2] + ) >= (1, 6): transformer = TextFeatures(variables=["text"]) - tags = transformer._more_tags() - assert tags["allow_nan"] is True - assert tags["variables"] == "categorical" - - def test_sklearn_tags(self): - """Test __sklearn_tags__ returns expected tags.""" - import sklearn - - if hasattr(sklearn, "__version__") and tuple( - int(x) for x in sklearn.__version__.split(".")[:2] - ) >= (1, 6): - transformer = TextFeatures(variables=["text"]) - tags = transformer.__sklearn_tags__() - assert tags.input_tags.allow_nan is True + tags = transformer.__sklearn_tags__() + assert tags.input_tags.allow_nan is True From 5c8e4d7f406bd14c8f49a5f4808ab6c6196d9bc2 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 16 Feb 
2026 09:43:23 -0600 Subject: [PATCH 10/33] Fix Sphinx build: add missing blank line after code-block --- docs/user_guide/text/TextFeatures.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst index 659c98570..3111de121 100644 --- a/docs/user_guide/text/TextFeatures.rst +++ b/docs/user_guide/text/TextFeatures.rst @@ -223,6 +223,7 @@ The output dataframe contains all 20 text features extracted from the ``review`` 1 4 0.800000 2 8 1.000000 3 4 1.000000 + Dropping original columns ~~~~~~~~~~~~~~~~~~~~~~~~~ From d5d8994567a71942f0d5ede3d87101738bf1bd61 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Sat, 21 Feb 2026 22:21:08 -0600 Subject: [PATCH 11/33] Refactor TextFeatures tests and address reviewer comments --- docs/user_guide/text/TextFeatures.rst | 125 +++--- feature_engine/text/text_features.py | 69 ++-- tests/test_text/test_text_features.py | 567 +++++++++++++++----------- 3 files changed, 425 insertions(+), 336 deletions(-) diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst index 3111de121..79397b81a 100644 --- a/docs/user_guide/text/TextFeatures.rst +++ b/docs/user_guide/text/TextFeatures.rst @@ -9,7 +9,8 @@ Short pieces of text are often found among the variables in our datasets. For ex While text data as such can't be used to train machine learning models, we can extract a lot of numerical information from these texts, which can provide predictive features to train machine learning models. -Feature-engine allows you to quickly extract numerical features from short pieces of text, to complement your predictive models. These features aim to capture a piece of text’s complexity by looking at some statistical parameters of the text, such as the word length and count, the number of words and unique words used, the number of sentences, and so on. :class:`TextFeatures()` extracts many numerical features from text out-of-the-box. 
+Feature-engine allows you to quickly extract numerical features from short pieces of text, to complement your predictive models. These features aim to capture a piece of text’s complexity by looking at some statistical parameters of the text, such as the word length and count, the number of words and unique words used, the number of sentences, and so on. +:class:`TextFeatures()` extracts many numerical features from text out-of-the-box. TextFeatures ------------ @@ -17,7 +18,7 @@ TextFeatures :class:`TextFeatures()` extracts numerical features from text/string variables. This transformer is useful for extracting basic text statistics that can be used as features in machine learning models. Users must explicitly specify which columns -contain text data via the ``variables`` parameter. +contain text data via the `variables` parameter. Unlike scikit-learn's CountVectorizer or TfidfVectorizer which create sparse matrices, :class:`TextFeatures()` extracts metadata features that remain in DataFrame format @@ -63,8 +64,7 @@ The features **number of unique words** and **lexical diversity** are intended t Handling missing values ----------------------- -By default, :class:`TextFeatures()` raises an error if the variables contain missing values. -This behavior can be changed by setting the parameter ``missing_values`` to ``'ignore'``. +By default, :class:`TextFeatures()` ignores missing values by treating them as empty strings (`missing_values='ignore'`). You can change this behavior by setting the parameter to `'raise'` if you prefer the transformer to raise an error when encountering missing data. 
In this case, missing values will be treated as empty strings, and the numerical features will be calculated accordingly (e.g., word count and character count will be 0) as shown in the following example: @@ -79,11 +79,10 @@ will be calculated accordingly (e.g., word count and character count will be 0) 'text': ['Hello', np.nan, 'World'] }) - # Set up the transformer to ignore missing values + # Set up the transformer (defaults to ignore missing values) tf = TextFeatures( variables=['text'], - features=['char_count'], - missing_values='ignore' + features=['char_count'] ) # Transform @@ -103,7 +102,8 @@ In the resulting dataframe, we see that the row with NaN returned 0 in the chara Python demo ----------- -Let's create a dataframe with text data and extract features: +In this section, we'll show how to use :class:`TextFeatures()`. +Let's create a dataframe with text data: .. code:: python @@ -149,8 +149,7 @@ Now let's extract 5 specific text features: the number of words, the number of c ) # Fit and transform - tf.fit(X) - X_transformed = tf.transform(X) + X_transformed = tf.fit_transform(X) print(X_transformed) @@ -158,17 +157,17 @@ In the following output, we see the resulting dataframe containing the numerical .. code-block:: none - review title review_word_count review_char_count - 0 This product is AMAZING! Best purchase ever. Great Product 7 45 - 1 Not great. Would not recommend. Disappointed 5 31 - 2 OK for the price. 3 out of 5 stars. Average 8 35 - 3 TERRIBLE!!! DO NOT BUY! Awful 4 23 + review title review_word_count review_char_count + 0 This product is AMAZING! Best purchase ever. Great Product 7 38 + 1 Not great. Would not recommend. Disappointed 5 27 + 2 OK for the price. 3 out of 5 stars. Average 9 27 + 3 TERRIBLE!!! DO NOT BUY! 
Awful 4 20 review_sentence_count review_has_digits review_uppercase_ratio - 0 2 0 0.066667 - 1 2 0 0.032258 - 2 2 3 0.057143 - 3 2 0 0.608696 + 0 2 0 0.236842 + 1 2 0 0.074074 + 2 2 1 0.074074 + 3 2 0 0.800000 Extracting all features ~~~~~~~~~~~~~~~~~~~~~~~ @@ -179,55 +178,54 @@ By default, if no text features are specified, all available features will be ex # Extract all features from a single text column tf = TextFeatures(variables=['review']) - tf.fit(X) - X_transformed = tf.transform(X) + X_transformed = tf.fit_transform(X) print(X_transformed.head()) -The output dataframe contains all 20 text features extracted from the ``review`` column: +The output dataframe contains all 20 text features extracted from the `review` column: .. code-block:: none - review_char_count review_word_count review_sentence_count review_avg_word_length - 0 45 7 2 5.428571 - 1 31 5 2 5.200000 - 2 35 8 2 3.375000 - 3 23 4 2 4.750000 - - review_digit_count review_letter_count review_uppercase_count review_lowercase_count - 0 0 38 3 35 - 1 0 26 1 25 - 2 2 25 2 23 - 3 0 14 14 0 - - review_special_char_count review_whitespace_count review_whitespace_ratio - 0 1 6 0.133333 - 1 2 4 0.129032 - 2 2 7 0.200000 - 3 3 3 0.130435 - - review_digit_ratio review_uppercase_ratio review_has_digits review_has_uppercase - 0 0.000000 0.066667 0 1 - 1 0.000000 0.032258 0 1 - 2 0.057143 0.057143 1 1 - 3 0.000000 0.608696 0 1 - - review_is_empty review_starts_with_uppercase review_ends_with_punctuation - 0 0 1 1 - 1 0 1 1 - 2 0 1 1 - 3 0 1 1 + review title review_char_count review_word_count + 0 This product is AMAZING! Best purchase ever. Great Product 38 7 + 1 Not great. Would not recommend. Disappointed 27 5 + 2 OK for the price. 3 out of 5 stars. Average 27 9 + 3 TERRIBLE!!! DO NOT BUY! 
Awful 20 4 + + review_sentence_count review_avg_word_length review_digit_count review_letter_count + 0 2 6.285714 0 36 + 1 2 6.200000 0 25 + 2 2 3.888889 2 23 + 3 2 5.750000 0 16 + + review_uppercase_count review_lowercase_count review_special_char_count review_whitespace_count + 0 9 27 2 6 + 1 2 23 2 4 + 2 2 21 2 8 + 3 16 0 4 3 + + review_whitespace_ratio review_digit_ratio review_uppercase_ratio review_has_digits + 0 0.136364 0.000000 0.236842 0 + 1 0.129032 0.000000 0.074074 0 + 2 0.228571 0.074074 0.074074 1 + 3 0.130435 0.000000 0.800000 0 + + review_has_uppercase review_is_empty review_starts_with_uppercase review_ends_with_punctuation + 0 1 0 1 1 + 1 1 0 1 1 + 2 1 0 1 1 + 3 1 0 1 1 review_unique_word_count review_lexical_diversity - 0 7 1.000000 - 1 4 0.800000 - 2 8 1.000000 - 3 4 1.000000 + 0 7 1.0 + 1 4 1.25 + 2 9 1.0 + 3 4 1.0 Dropping original columns ~~~~~~~~~~~~~~~~~~~~~~~~~ -You can drop the original text columns after extracting features, by setting the parameter ``drop_original`` to ``True``: +You can drop the original text columns after extracting features, by setting the parameter `drop_original` to `True`: .. code:: python @@ -237,25 +235,24 @@ You can drop the original text columns after extracting features, by setting the drop_original=True ) - tf.fit(X) - X_transformed = tf.transform(X) + X_transformed = tf.fit_transform(X) print(X_transformed) -The original ``review`` column has been removed, and only the ``title`` column and the extracted features remain: +The original `'review'` column has been removed, and only the `'title'` column and the extracted features remain: .. 
code-block:: none title review_word_count review_char_count - 0 Great Product 7 45 - 1 Disappointed 5 31 - 2 Average 8 35 - 3 Awful 4 23 + 0 Great Product 7 38 + 1 Disappointed 5 27 + 2 Average 9 27 + 3 Awful 4 20 Combining with scikit-learn Bag-of-Words ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In most NLP tasks, it is common to use bag-of-words (e.g., ``CountVectorizer``) or TF-IDF (e.g., ``TfidfVectorizer``) to represent the text. :class:`TextFeatures()` can be used alongside these methods to provide additional metadata that might improve model performance. +In most NLP tasks, it is common to use bag-of-words (e.g., `CountVectorizer`) or TF-IDF (e.g., `TfidfVectorizer`) to represent the text. :class:`TextFeatures()` can be used alongside these transformers to provide additional metadata that might improve model performance. In the following example, we compare a baseline model using only TF-IDF with a model that combines TF-IDF and :class:`TextFeatures()` metadata: diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 42602c43e..43f0fe30f 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -10,18 +10,23 @@ from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin from feature_engine._check_init_parameters.check_init_input_params import ( _check_param_drop_original, + _check_param_missing_values, +) +from feature_engine.dataframe_checks import ( + _check_contains_na, + _check_X_matches_training_df, + check_X, ) -from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X from feature_engine.tags import _return_tags # Available text features and their computation functions TEXT_FEATURES = { - "char_count": lambda x: x.str.len(), - "word_count": lambda x: x.str.split().str.len(), + "char_count": lambda x: x.str.replace(r"\s+", "", regex=True).str.len(), + "word_count": lambda x: x.str.strip().str.split().str.len(), "sentence_count": lambda 
x: x.str.count(r"[.!?]+"), "avg_word_length": lambda x: ( - x.str.replace(r"\s+", "", regex=True).str.len() / - x.str.split().str.len() + x.str.strip().str.len() / + x.str.strip().str.split().str.len() ).fillna(0), "digit_count": lambda x: x.str.count(r"\d"), "letter_count": lambda x: x.str.count(r"[a-zA-Z]"), @@ -30,22 +35,18 @@ "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), "whitespace_count": lambda x: x.str.count(r"\s"), "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1), - "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1), - "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1), + "digit_ratio": lambda x: x.str.count(r"\d") / x.str.replace(r"\s+", "", regex=True).str.len().replace(0, 1), + "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.replace(r"\s+", "", regex=True).str.len().replace(0, 1), "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int), "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int), "is_empty": lambda x: (x.str.len() == 0).astype(int), "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), "unique_word_count": lambda x: ( - x.str.lower().str.split().apply( - lambda s: len(set(s)) if isinstance(s, list) else 0 - ) + x.str.lower().str.split().apply(set).str.len() ), "lexical_diversity": lambda x: ( - x.str.lower().str.split().apply( - lambda s: len(set(s)) if isinstance(s, list) else 0 - ) / x.str.split().str.len() + x.str.strip().str.split().str.len() / x.str.lower().str.split().apply(set).str.len() ).fillna(0), } @@ -91,6 +92,11 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): If None, extracts all available features. + missing_values: string, default='ignore' + If 'ignore', NaNs will be filled with an empty string before feature + extraction. 
If 'raise', the transformer will raise an error if missing data + is found. + drop_original: bool, default=False Whether to drop the original text columns after transformation. @@ -140,18 +146,22 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): ... features=['char_count', 'word_count', 'has_digits'] ... ) >>> tf.fit(X) + TextFeatures(features=['char_count', 'word_count', 'has_digits'], + variables=['text']) >>> X = tf.transform(X) - >>> X + >>> pd.options.display.max_columns = 10 + >>> print(X) text text_char_count text_word_count text_has_digits - 0 Hello World! 12 2 0 - 1 Python is GREAT. 16 3 0 - 2 ML rocks 123 12 3 1 + 0 Hello World! 11 2 0 + 1 Python is GREAT. 14 3 0 + 2 ML rocks 123 10 3 1 """ def __init__( self, variables: Union[str, List[str]], features: Optional[List[str]] = None, + missing_values: str = "ignore", drop_original: bool = False, ) -> None: @@ -183,28 +193,28 @@ def __init__( ) _check_param_drop_original(drop_original) + _check_param_missing_values(missing_values) self.variables = variables self.features = features + self.missing_values = missing_values self.drop_original = drop_original def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ + This transformer does not learn any parameters. + Stores feature names and validates that the specified variables are - present and are of string/object type. + present. Parameters ---------- X: pandas dataframe of shape = [n_samples, n_features] - The training input samples. + The training input samples. Can be the entire dataframe, not just the + variables to transform. y: pandas Series, or np.array. Defaults to None. It is not needed in this transformer. You can pass y or None. - - Returns - ------- - self: TextFeatures - The fitted transformer. 
""" # check input dataframe @@ -232,6 +242,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.variables_ = self.variables + # check if dataset contains na + if self.missing_values == "raise": + _check_contains_na(X, self.variables_) + # Set features to extract if self.features is None: self.features_ = list(TEXT_FEATURES.keys()) @@ -270,10 +284,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # Check if input data contains same number of columns as dataframe used to fit. _check_X_matches_training_df(X, self.n_features_in_) + # check if dataset contains na + if self.missing_values == "raise": + _check_contains_na(X, self.variables_) + # reorder variables to match train set X = X[self.feature_names_in_] # Fill NaN with empty string for feature extraction + # This is safe because if missing_values is 'raise', it would have + # raised an error above. So any remaining NaNs are either intended to + # be filled or there are none. X[self.variables_] = X[self.variables_].fillna("") # Extract features for each text variable diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index de914fa85..7481dd813 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -2,7 +2,121 @@ import pytest from feature_engine.text import TextFeatures +from feature_engine.text.text_features import TEXT_FEATURES + +# ============================================================================== +# INIT TESTS +# ============================================================================== + +@pytest.mark.parametrize( + "invalid_variables", + [ + 123, + True, + [1, 2], + ["text", 123], + {"text": 1}, + ], +) +def test_invalid_variables_raises_error(invalid_variables): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=invalid_variables) + +@pytest.mark.parametrize( + "invalid_features, 
err_msg", + [ + ("some_string", "features must be"), + ([1, 2], "features must be"), + (123, "features must be"), + (True, "features must be"), + (["some_string", True], "features must be"), + ({"some_string": 1}, "features must be"), + (["invalid_feature"], "Invalid features"), + (["char_count", "invalid_feature"], "Invalid features"), + ], +) +def test_invalid_features_raises_error(invalid_features, err_msg): + """Test that invalid features parameter raises ValueError.""" + with pytest.raises(ValueError, match=err_msg): + TextFeatures(variables=["text"], features=invalid_features) + +# ============================================================================== +# FIT TESTS +# ============================================================================== + +def test_fit_stores_attributes(): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["text"]) + transformer.fit(X) + + assert transformer.variables_ == ["text"] + assert transformer.features_ == list(TEXT_FEATURES.keys()) + assert transformer.feature_names_in_ == ["text"] + assert transformer.n_features_in_ == 1 + +def test_missing_variable_raises_error(): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) +def test_no_text_columns_raises_error(): + """Test that no text columns raises error.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures(variables=["a"]) + with pytest.raises(ValueError, match="not object or string"): + transformer.fit(X) + +def test_nan_handling_raise_error_fit(): + """Test handling of NaN values when missing_values is 'raise' on fit.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures( + variables=["text"], features=["char_count"], 
missing_values="raise" + ) + with pytest.raises(ValueError): + transformer.fit(X) + +# ============================================================================== +# TRANSFORM TESTS - GENERAL +# ============================================================================== + +def test_transform_on_new_data(): + """Test transform works on new data after fit.""" + X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) + X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) + + transformer = TextFeatures( + variables=["text"], + features=["char_count", "has_digits"] + ) + transformer.fit(X_train) + X_tr = transformer.transform(X_test) + + assert X_tr["text_char_count"].tolist() == [7, 7] + assert X_tr["text_has_digits"].tolist() == [0, 1] + +def test_nan_handling_raise_error_transform(): + """Test handling of NaN values when missing_values is 'raise' on transform.""" + X_train = pd.DataFrame({"text": ["Hello", "World"]}) + X_test = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures( + variables=["text"], features=["char_count"], missing_values="raise" + ) + transformer.fit(X_train) + with pytest.raises(ValueError): + transformer.transform(X_test) + +def test_nan_handling(): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(variables=["text"], features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # NaN should be filled with empty string, resulting in char_count of 0 + assert X_tr["text_char_count"].tolist() == [5, 0, 5] def test_default_all_features(): """Test extracting all features with default parameters.""" @@ -10,21 +124,11 @@ def test_default_all_features(): transformer = TextFeatures(variables=["text"]) X_tr = transformer.fit_transform(X) - # Check that new columns were added - assert "text_char_count" in X_tr.columns - assert "text_word_count" in X_tr.columns - assert "text_digit_count" in X_tr.columns - - # Check char_count - assert 
X_tr["text_char_count"].tolist() == [12, 10, 2] - - # Check word_count + # Spot check a few features to ensure they were added and computed + assert X_tr["text_char_count"].tolist() == [11, 9, 2] assert X_tr["text_word_count"].tolist() == [2, 2, 1] - - # Check digit_count assert X_tr["text_digit_count"].tolist() == [0, 3, 0] - def test_specific_features(): """Test extracting specific features only.""" X = pd.DataFrame({"text": ["Hello", "World"]}) @@ -35,11 +139,7 @@ def test_specific_features(): X_tr = transformer.fit_transform(X) # Check only specified features are extracted - assert "text_char_count" in X_tr.columns - assert "text_word_count" in X_tr.columns - assert "text_digit_count" not in X_tr.columns - assert "text_uppercase_count" not in X_tr.columns - + assert X_tr.columns.tolist() == ["text", "text_char_count", "text_word_count"] def test_specific_variables(): """Test extracting features from specific variables only.""" @@ -50,9 +150,7 @@ def test_specific_variables(): X_tr = transformer.fit_transform(X) # Only text1 should have features extracted - assert "text1_char_count" in X_tr.columns - assert "text2_char_count" not in X_tr.columns - + assert X_tr.columns.tolist() == ["text1", "text2", "numeric", "text1_char_count"] def test_drop_original(): """Test drop_original parameter.""" @@ -64,116 +162,218 @@ def test_drop_original(): ) X_tr = transformer.fit_transform(X) - assert "text" not in X_tr.columns - assert "text_char_count" in X_tr.columns - assert "other" in X_tr.columns + assert X_tr.columns.tolist() == ["other", "text_char_count"] +def test_string_variable_input(): + """Test that passing a single string variable works (auto-converted to list).""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": ["A", "B"]}) + transformer = TextFeatures(variables="text", features=["char_count"]) + X_tr = transformer.fit_transform(X) + + assert transformer.variables_ == ["text"] + assert X_tr.columns.tolist() == ["text", "other", "text_char_count"] + 
assert X_tr["text_char_count"].tolist() == [5, 5] -def test_empty_string_handling(): - """Test handling of empty strings.""" - X = pd.DataFrame({"text": ["", "Hello", ""]}) +def test_multiple_text_columns(): + """Test extracting features from multiple text columns.""" + X = pd.DataFrame({"a": ["Hello", "World"], "b": ["Foo", "Bar"]}) transformer = TextFeatures( - variables=["text"], features=["char_count", "word_count", "is_empty"] + variables=["a", "b"], + features=["char_count", "word_count"] ) X_tr = transformer.fit_transform(X) - assert X_tr["text_char_count"].tolist() == [0, 5, 0] - assert X_tr["text_is_empty"].tolist() == [1, 0, 1] - - -def test_nan_handling(): - """Test handling of NaN values.""" - X = pd.DataFrame({"text": ["Hello", None, "World"]}) - transformer = TextFeatures(variables=["text"], features=["char_count"]) + assert X_tr.columns.tolist() == ["a", "b", "a_char_count", "a_word_count", "b_char_count", "b_word_count"] + +# ============================================================================== +# TRANSFORM TESTS - INDIVIDUAL FEATURES +# ============================================================================== + +def test_whitespace_features(): + """Test whitespace_features.""" + X = pd.DataFrame({"text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ]}) + transformer = TextFeatures(variables=["text"], features=["whitespace_count", "whitespace_ratio"]) X_tr = transformer.fit_transform(X) - - # NaN should be filled with empty string, resulting in char_count of 0 - assert X_tr["text_char_count"].tolist() == [5, 0, 5] - - -def test_letter_count(): - """Test letter count feature.""" - X = pd.DataFrame({"text": ["Hello 123", "WORLD!", "abc..."]}) - transformer = TextFeatures(variables=["text"], features=["letter_count"]) + assert X_tr["text_whitespace_count"].tolist() == [1, 0, 0, 1, 3, 2, 0, 0, 0, 2, 0, 1, 1, 0, 3, 5, 1, 1, 1, 10] + assert X_tr["text_whitespace_ratio"].tolist() == [0.08333333333333333, 0.0, 0.0, 0.1111111111111111, 1.0, 0.2, 0.0, 0.0, 0.0, 0.25, 0.0, 0.16666666666666666, 0.07692307692307693, 0.0, 0.15789473684210525, 0.2631578947368421, 0.08333333333333333, 0.07692307692307693, 0.058823529411764705, 0.09900990099009901] + +def test_digit_features(): + """Test digit_features.""" + X = pd.DataFrame({"text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ]}) + transformer = TextFeatures(variables=["text"], features=["digit_count", "digit_ratio", "has_digits"]) X_tr = transformer.fit_transform(X) - - assert X_tr["text_letter_count"].tolist() == [5, 5, 3] - + assert X_tr["text_digit_count"].tolist() == [0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 5, 0, 0, 0, 0] + assert X_tr["text_digit_ratio"].tolist() == [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.35714285714285715, 0.0, 0.0, 0.0, 0.0] + assert X_tr["text_has_digits"].tolist() == [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0] def test_uppercase_features(): - """Test uppercase-related features.""" - X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) - transformer = TextFeatures( - variables=["text"], - features=["uppercase_count", "has_uppercase", "starts_with_uppercase"], - ) + """Test uppercase_features.""" + X = pd.DataFrame({"text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ]}) + transformer = TextFeatures(variables=["text"], features=["uppercase_count", "uppercase_ratio", "has_uppercase", "starts_with_uppercase"]) X_tr = transformer.fit_transform(X) + assert X_tr["text_uppercase_count"].tolist() == [2, 5, 0, 0, 0, 0, 0, 0, 0, 3, 3, 1, 2, 0, 0, 2, 2, 2, 2, 1] + assert X_tr["text_uppercase_ratio"].tolist() == [0.18181818181818182, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.6, 0.2, 0.16666666666666666, 0.0, 0.0, 0.14285714285714285, 0.18181818181818182, 0.16666666666666666, 0.125, 0.01098901098901099] + assert X_tr["text_has_uppercase"].tolist() == [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1] - assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] - assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] - assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] - - -def test_sentence_count(): - """Test sentence counting.""" - X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! C."]}) - transformer = TextFeatures(variables=["text"], features=["sentence_count"]) +def test_punctuation_features(): + """Test punctuation_features.""" + X = pd.DataFrame({"text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ]}) + transformer = TextFeatures(variables=["text"], features=["special_char_count", "ends_with_punctuation"]) X_tr = transformer.fit_transform(X) - - assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] - - -def test_unique_word_features(): - """Test unique word features.""" - X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) - transformer = TextFeatures( - variables=["text"], features=["unique_word_count", "lexical_diversity"] - ) + assert X_tr["text_special_char_count"].tolist() == [1, 0, 0, 4, 0, 0, 3, 0, 0, 3, 0, 3, 4, 3, 3, 5, 1, 2, 6, 1] + assert X_tr["text_ends_with_punctuation"].tolist() == [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1] + +def test_word_features(): + """Test word_features.""" + X = pd.DataFrame({"text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ]}) + transformer = TextFeatures(variables=["text"], features=["word_count", "unique_word_count", "lexical_diversity", "avg_word_length"]) X_tr = transformer.fit_transform(X) + assert X_tr["text_word_count"].tolist() == [2, 1, 1, 2, 0, 1, 1, 0, 0, 3, 1, 2, 2, 1, 4, 6, 2, 2, 2, 11] + assert X_tr["text_unique_word_count"].tolist() == [2, 1, 1, 2, 0, 1, 1, 0, 0, 3, 1, 2, 2, 1, 4, 6, 2, 2, 2, 11] + assert X_tr["text_lexical_diversity"].tolist() == [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + assert X_tr["text_avg_word_length"].tolist() == [6.0, 5.0, 5.0, 4.5, 0.0, 8.0, 6.0, 0.0, 0.0, 2.6666666666666665, 5.0, 3.0, 6.5, 3.0, 4.75, 3.1666666666666665, 6.0, 6.5, 8.5, 9.181818181818182] + +def test_basic_features(): + """Test basic_features.""" + X = pd.DataFrame({"text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ]}) + transformer = TextFeatures(variables=["text"], features=["char_count", "sentence_count", "letter_count", "lowercase_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + assert X_tr["text_char_count"].tolist() == [11, 5, 5, 8, 0, 8, 6, 0, 0, 6, 5, 5, 12, 3, 16, 14, 11, 12, 16, 91] + assert X_tr["text_sentence_count"].tolist() == [1, 0, 0, 4, 0, 0, 1, 0, 0, 3, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1] + assert X_tr["text_letter_count"].tolist() == [10, 5, 0, 4, 0, 8, 3, 0, 0, 3, 5, 2, 4, 0, 13, 4, 10, 10, 10, 90] + assert X_tr["text_lowercase_count"].tolist() == [8, 0, 0, 4, 0, 8, 3, 0, 0, 0, 2, 1, 2, 0, 13, 2, 8, 8, 8, 89] + assert X_tr["text_is_empty"].tolist() == [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] - assert X_tr["text_lexical_diversity"].tolist() == [1 / 3, 1.0, 1.0] - - -def test_invalid_feature_raises_error(): - """Test that invalid feature name raises ValueError.""" - with pytest.raises(ValueError, match="Invalid features"): - TextFeatures(variables=["text"], features=["invalid_feature"]) - - -def test_invalid_variables_raises_error(): - """Test that invalid variables parameter raises ValueError.""" - with pytest.raises(ValueError, match="variables must be"): - TextFeatures(variables=123) - - -def test_missing_variable_raises_error(): - """Test that missing variable raises ValueError on fit.""" - X = pd.DataFrame({"text": ["Hello"]}) - transformer = TextFeatures(variables=["nonexistent"]) - with pytest.raises(ValueError, match="not present in the dataframe"): - transformer.fit(X) - - -def test_no_text_columns_raises_error(): - """Test that no text columns raises error.""" - X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - transformer = TextFeatures(variables=["a"]) - with pytest.raises(ValueError, match="not object or string"): - transformer.fit(X) - - 
-def test_fit_stores_attributes(): - """Test that fit stores expected attributes.""" - X = pd.DataFrame({"text": ["Hello"]}) - transformer = TextFeatures(variables=["text"]) - transformer.fit(X) - - assert hasattr(transformer, "variables_") - assert hasattr(transformer, "features_") - assert hasattr(transformer, "feature_names_in_") - assert hasattr(transformer, "n_features_in_") - +# ============================================================================== +# OTHER METHOD TESTS +# ============================================================================== def test_get_feature_names_out(): """Test get_feature_names_out returns correct names.""" @@ -185,11 +385,8 @@ def test_get_feature_names_out(): transformer.fit(X) feature_names = transformer.get_feature_names_out() - assert "text" in feature_names - assert "other" in feature_names - assert "text_char_count" in feature_names - assert "text_word_count" in feature_names - + expected_features = ["text", "other", "text_char_count", "text_word_count"] + assert feature_names == expected_features def test_get_feature_names_out_with_drop(): """Test get_feature_names_out with drop_original=True.""" @@ -202,131 +399,5 @@ def test_get_feature_names_out_with_drop(): transformer.fit(X) feature_names = transformer.get_feature_names_out() - assert "text" not in feature_names - assert "other" in feature_names - assert "text_char_count" in feature_names - - -def test_string_variable_input(): - """Test that passing a single string variable works (auto-converted to list).""" - X = pd.DataFrame({"text": ["Hello", "World"], "other": ["A", "B"]}) - transformer = TextFeatures(variables="text", features=["char_count"]) - X_tr = transformer.fit_transform(X) - - assert "text_char_count" in X_tr.columns - assert "other_char_count" not in X_tr.columns - assert X_tr["text_char_count"].tolist() == [5, 5] - - -def test_invalid_features_type_raises_error(): - """Test that invalid features type raises ValueError.""" - with 
pytest.raises(ValueError, match="features must be"): - TextFeatures(variables=["text"], features="char_count") - - -def test_multiple_text_columns(): - """Test extracting features from multiple text columns.""" - X = pd.DataFrame({"a": ["Hello", "World"], "b": ["Foo", "Bar"]}) - transformer = TextFeatures( - variables=["a", "b"], - features=["char_count", "word_count"] - ) - X_tr = transformer.fit_transform(X) - - assert "a_char_count" in X_tr.columns - assert "b_char_count" in X_tr.columns - assert "a_word_count" in X_tr.columns - assert "b_word_count" in X_tr.columns - - -def test_transform_on_new_data(): - """Test transform works on new data after fit.""" - X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) - X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) - - transformer = TextFeatures( - variables=["text"], - features=["char_count", "has_digits"] - ) - transformer.fit(X_train) - X_tr = transformer.transform(X_test) - - assert X_tr["text_char_count"].tolist() == [8, 8] - assert X_tr["text_has_digits"].tolist() == [0, 1] - - -def test_punctuation_features(): - """Test punctuation-related features.""" - X = pd.DataFrame({"text": ["Hello.", "World", "Hi!"]}) - transformer = TextFeatures( - variables=["text"], - features=["ends_with_punctuation", "special_char_count"] - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_ends_with_punctuation"].tolist() == [1, 0, 1] - assert X_tr["text_special_char_count"].tolist() == [1, 0, 1] - - -def test_ratio_features(): - """Test ratio features with known values.""" - X = pd.DataFrame({"text": ["AB12", "abcd"]}) - transformer = TextFeatures( - variables=["text"], - features=["digit_ratio", "uppercase_ratio", "whitespace_ratio"] - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_digit_ratio"].tolist() == [0.5, 0.0] - assert X_tr["text_uppercase_ratio"].tolist() == [0.5, 0.0] - assert X_tr["text_whitespace_ratio"].tolist() == [0.0, 0.0] - - -def test_avg_word_length(): - """Test 
average word length feature.""" - X = pd.DataFrame({"text": ["ab cd", "a"]}) - transformer = TextFeatures(variables=["text"], features=["avg_word_length"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_avg_word_length"].tolist() == [2.0, 1.0] - - -def test_lowercase_count(): - """Test lowercase count feature.""" - X = pd.DataFrame({"text": ["Hello", "WORLD"]}) - transformer = TextFeatures(variables=["text"], features=["lowercase_count"]) - X_tr = transformer.fit_transform(X) - - assert X_tr["text_lowercase_count"].tolist() == [4, 0] - - -def test_variables_list_non_strings_raises_error(): - """Test that a list of non-string variables raises ValueError.""" - with pytest.raises(ValueError, match="variables must be"): - TextFeatures(variables=[1, 2]) - - -def test_features_list_non_strings_raises_error(): - """Test that a list of non-string features raises ValueError.""" - with pytest.raises(ValueError, match="features must be"): - TextFeatures(variables=["text"], features=[1, 2]) - - -def test_more_tags(): - """Test _more_tags returns expected tags.""" - transformer = TextFeatures(variables=["text"]) - tags = transformer._more_tags() - assert tags["allow_nan"] is True - assert tags["variables"] == "categorical" - - -def test_sklearn_tags(): - """Test __sklearn_tags__ returns expected tags.""" - import sklearn - - if hasattr(sklearn, "__version__") and tuple( - int(x) for x in sklearn.__version__.split(".")[:2] - ) >= (1, 6): - transformer = TextFeatures(variables=["text"]) - tags = transformer.__sklearn_tags__() - assert tags.input_tags.allow_nan is True + expected_features = ["other", "text_char_count"] + assert feature_names == expected_features From a049233f13cbe7540df66323946cefdefe138a1e Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Sat, 21 Feb 2026 22:23:50 -0600 Subject: [PATCH 12/33] Fix mypy invariant type error for check_contains_na --- feature_engine/text/text_features.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff 
--git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 43f0fe30f..7811fb94a 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -1,7 +1,6 @@ # Authors: Ankit Hemant Lade (contributor) # License: BSD 3 clause - -from typing import List, Optional, Union +from typing import List, Optional, Union, cast import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin @@ -244,7 +243,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check if dataset contains na if self.missing_values == "raise": - _check_contains_na(X, self.variables_) + _check_contains_na(X, cast(list[Union[str, int]], self.variables_)) # Set features to extract if self.features is None: @@ -286,7 +285,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check if dataset contains na if self.missing_values == "raise": - _check_contains_na(X, self.variables_) + _check_contains_na(X, cast(list[Union[str, int]], self.variables_)) # reorder variables to match train set X = X[self.feature_names_in_] From 91fd84086a80bffc06b652ebe7600e3dfc5c6965 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Sat, 21 Feb 2026 22:25:30 -0600 Subject: [PATCH 13/33] Autoformat files using black to resolve flake8 line-length constraints --- feature_engine/text/text_features.py | 18 +- tests/test_text/test_text_features.py | 847 ++++++++++++++++++++------ 2 files changed, 683 insertions(+), 182 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 7811fb94a..bd1ebe92a 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -24,8 +24,7 @@ "word_count": lambda x: x.str.strip().str.split().str.len(), "sentence_count": lambda x: x.str.count(r"[.!?]+"), "avg_word_length": lambda x: ( - x.str.strip().str.len() / - x.str.strip().str.split().str.len() + x.str.strip().str.len() / x.str.strip().str.split().str.len() ).fillna(0), 
"digit_count": lambda x: x.str.count(r"\d"), "letter_count": lambda x: x.str.count(r"[a-zA-Z]"), @@ -34,18 +33,19 @@ "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), "whitespace_count": lambda x: x.str.count(r"\s"), "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1), - "digit_ratio": lambda x: x.str.count(r"\d") / x.str.replace(r"\s+", "", regex=True).str.len().replace(0, 1), - "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.replace(r"\s+", "", regex=True).str.len().replace(0, 1), + "digit_ratio": lambda x: x.str.count(r"\d") + / x.str.replace(r"\s+", "", regex=True).str.len().replace(0, 1), + "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") + / x.str.replace(r"\s+", "", regex=True).str.len().replace(0, 1), "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int), "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int), "is_empty": lambda x: (x.str.len() == 0).astype(int), "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), - "unique_word_count": lambda x: ( - x.str.lower().str.split().apply(set).str.len() - ), + "unique_word_count": lambda x: (x.str.lower().str.split().apply(set).str.len()), "lexical_diversity": lambda x: ( - x.str.strip().str.split().str.len() / x.str.lower().str.split().apply(set).str.len() + x.str.strip().str.split().str.len() + / x.str.lower().str.split().apply(set).str.len() ).fillna(0), } @@ -291,7 +291,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = X[self.feature_names_in_] # Fill NaN with empty string for feature extraction - # This is safe because if missing_values is 'raise', it would have + # This is safe because if missing_values is 'raise', it would have # raised an error above. So any remaining NaNs are either intended to # be filled or there are none. 
X[self.variables_] = X[self.variables_].fillna("") diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 7481dd813..d176b3c39 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -8,6 +8,7 @@ # INIT TESTS # ============================================================================== + @pytest.mark.parametrize( "invalid_variables", [ @@ -23,6 +24,7 @@ def test_invalid_variables_raises_error(invalid_variables): with pytest.raises(ValueError, match="variables must be"): TextFeatures(variables=invalid_variables) + @pytest.mark.parametrize( "invalid_features, err_msg", [ @@ -41,10 +43,12 @@ def test_invalid_features_raises_error(invalid_features, err_msg): with pytest.raises(ValueError, match=err_msg): TextFeatures(variables=["text"], features=invalid_features) + # ============================================================================== # FIT TESTS # ============================================================================== + def test_fit_stores_attributes(): """Test that fit stores expected attributes.""" X = pd.DataFrame({"text": ["Hello"]}) @@ -56,6 +60,7 @@ def test_fit_stores_attributes(): assert transformer.feature_names_in_ == ["text"] assert transformer.n_features_in_ == 1 + def test_missing_variable_raises_error(): """Test that missing variable raises ValueError on fit.""" X = pd.DataFrame({"text": ["Hello"]}) @@ -63,6 +68,7 @@ def test_missing_variable_raises_error(): with pytest.raises(ValueError, match="not present in the dataframe"): transformer.fit(X) + def test_no_text_columns_raises_error(): """Test that no text columns raises error.""" X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -70,6 +76,7 @@ def test_no_text_columns_raises_error(): with pytest.raises(ValueError, match="not object or string"): transformer.fit(X) + def test_nan_handling_raise_error_fit(): """Test handling of NaN values when missing_values is 'raise' on fit.""" X = 
pd.DataFrame({"text": ["Hello", None, "World"]}) @@ -79,18 +86,19 @@ def test_nan_handling_raise_error_fit(): with pytest.raises(ValueError): transformer.fit(X) + # ============================================================================== # TRANSFORM TESTS - GENERAL # ============================================================================== + def test_transform_on_new_data(): """Test transform works on new data after fit.""" X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) transformer = TextFeatures( - variables=["text"], - features=["char_count", "has_digits"] + variables=["text"], features=["char_count", "has_digits"] ) transformer.fit(X_train) X_tr = transformer.transform(X_test) @@ -98,6 +106,7 @@ def test_transform_on_new_data(): assert X_tr["text_char_count"].tolist() == [7, 7] assert X_tr["text_has_digits"].tolist() == [0, 1] + def test_nan_handling_raise_error_transform(): """Test handling of NaN values when missing_values is 'raise' on transform.""" X_train = pd.DataFrame({"text": ["Hello", "World"]}) @@ -109,6 +118,7 @@ def test_nan_handling_raise_error_transform(): with pytest.raises(ValueError): transformer.transform(X_test) + def test_nan_handling(): """Test handling of NaN values.""" X = pd.DataFrame({"text": ["Hello", None, "World"]}) @@ -118,6 +128,7 @@ def test_nan_handling(): # NaN should be filled with empty string, resulting in char_count of 0 assert X_tr["text_char_count"].tolist() == [5, 0, 5] + def test_default_all_features(): """Test extracting all features with default parameters.""" X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) @@ -129,18 +140,19 @@ def test_default_all_features(): assert X_tr["text_word_count"].tolist() == [2, 2, 1] assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + def test_specific_features(): """Test extracting specific features only.""" X = pd.DataFrame({"text": ["Hello", "World"]}) transformer = TextFeatures( - 
variables=["text"], - features=["char_count", "word_count"] + variables=["text"], features=["char_count", "word_count"] ) X_tr = transformer.fit_transform(X) # Check only specified features are extracted assert X_tr.columns.tolist() == ["text", "text_char_count", "text_word_count"] + def test_specific_variables(): """Test extracting features from specific variables only.""" X = pd.DataFrame( @@ -152,18 +164,18 @@ def test_specific_variables(): # Only text1 should have features extracted assert X_tr.columns.tolist() == ["text1", "text2", "numeric", "text1_char_count"] + def test_drop_original(): """Test drop_original parameter.""" X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) transformer = TextFeatures( - variables=["text"], - features=["char_count"], - drop_original=True + variables=["text"], features=["char_count"], drop_original=True ) X_tr = transformer.fit_transform(X) assert X_tr.columns.tolist() == ["other", "text_char_count"] + def test_string_variable_input(): """Test that passing a single string variable works (auto-converted to list).""" X = pd.DataFrame({"text": ["Hello", "World"], "other": ["A", "B"]}) @@ -174,213 +186,703 @@ def test_string_variable_input(): assert X_tr.columns.tolist() == ["text", "other", "text_char_count"] assert X_tr["text_char_count"].tolist() == [5, 5] + def test_multiple_text_columns(): """Test extracting features from multiple text columns.""" X = pd.DataFrame({"a": ["Hello", "World"], "b": ["Foo", "Bar"]}) transformer = TextFeatures( - variables=["a", "b"], - features=["char_count", "word_count"] + variables=["a", "b"], features=["char_count", "word_count"] ) X_tr = transformer.fit_transform(X) - assert X_tr.columns.tolist() == ["a", "b", "a_char_count", "a_word_count", "b_char_count", "b_word_count"] + assert X_tr.columns.tolist() == [ + "a", + "b", + "a_char_count", + "a_word_count", + "b_char_count", + "b_word_count", + ] + # ============================================================================== # 
TRANSFORM TESTS - INDIVIDUAL FEATURES # ============================================================================== + def test_whitespace_features(): """Test whitespace_features.""" - X = pd.DataFrame({"text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", - ]}) - transformer = TextFeatures(variables=["text"], features=["whitespace_count", "whitespace_ratio"]) + X = pd.DataFrame( + { + "text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ] + } + ) + transformer = TextFeatures( + variables=["text"], features=["whitespace_count", "whitespace_ratio"] + ) X_tr = transformer.fit_transform(X) - assert X_tr["text_whitespace_count"].tolist() == [1, 0, 0, 1, 3, 2, 0, 0, 0, 2, 0, 1, 1, 0, 3, 5, 1, 1, 1, 10] - assert X_tr["text_whitespace_ratio"].tolist() == [0.08333333333333333, 0.0, 0.0, 0.1111111111111111, 1.0, 0.2, 0.0, 0.0, 0.0, 0.25, 0.0, 0.16666666666666666, 0.07692307692307693, 0.0, 0.15789473684210525, 0.2631578947368421, 0.08333333333333333, 0.07692307692307693, 0.058823529411764705, 0.09900990099009901] + assert X_tr["text_whitespace_count"].tolist() == [ + 1, + 0, + 0, + 1, + 3, + 2, + 0, + 0, + 0, + 2, + 0, + 1, + 1, + 0, + 3, + 5, + 1, + 1, + 1, + 10, + ] + assert X_tr["text_whitespace_ratio"].tolist() == [ + 0.08333333333333333, + 0.0, + 0.0, + 0.1111111111111111, + 1.0, + 0.2, + 0.0, + 0.0, + 0.0, + 0.25, + 0.0, + 0.16666666666666666, + 0.07692307692307693, + 0.0, + 0.15789473684210525, + 0.2631578947368421, + 0.08333333333333333, + 0.07692307692307693, + 0.058823529411764705, + 0.09900990099009901, + ] + def test_digit_features(): """Test digit_features.""" - X = pd.DataFrame({"text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", - ]}) - transformer = TextFeatures(variables=["text"], features=["digit_count", "digit_ratio", "has_digits"]) + X = pd.DataFrame( + { + "text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! 
@#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ] + } + ) + transformer = TextFeatures( + variables=["text"], features=["digit_count", "digit_ratio", "has_digits"] + ) X_tr = transformer.fit_transform(X) - assert X_tr["text_digit_count"].tolist() == [0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 5, 0, 0, 0, 0] - assert X_tr["text_digit_ratio"].tolist() == [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.35714285714285715, 0.0, 0.0, 0.0, 0.0] - assert X_tr["text_has_digits"].tolist() == [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0] + assert X_tr["text_digit_count"].tolist() == [ + 0, + 0, + 5, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 4, + 0, + 0, + 5, + 0, + 0, + 0, + 0, + ] + assert X_tr["text_digit_ratio"].tolist() == [ + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.3333333333333333, + 0.0, + 0.0, + 0.35714285714285715, + 0.0, + 0.0, + 0.0, + 0.0, + ] + assert X_tr["text_has_digits"].tolist() == [ + 0, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + ] + def test_uppercase_features(): """Test uppercase_features.""" - X = pd.DataFrame({"text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... 
World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", - ]}) - transformer = TextFeatures(variables=["text"], features=["uppercase_count", "uppercase_ratio", "has_uppercase", "starts_with_uppercase"]) + X = pd.DataFrame( + { + "text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ] + } + ) + transformer = TextFeatures( + variables=["text"], + features=[ + "uppercase_count", + "uppercase_ratio", + "has_uppercase", + "starts_with_uppercase", + ], + ) X_tr = transformer.fit_transform(X) - assert X_tr["text_uppercase_count"].tolist() == [2, 5, 0, 0, 0, 0, 0, 0, 0, 3, 3, 1, 2, 0, 0, 2, 2, 2, 2, 1] - assert X_tr["text_uppercase_ratio"].tolist() == [0.18181818181818182, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.6, 0.2, 0.16666666666666666, 0.0, 0.0, 0.14285714285714285, 0.18181818181818182, 0.16666666666666666, 0.125, 0.01098901098901099] - assert X_tr["text_has_uppercase"].tolist() == [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1] - assert X_tr["text_starts_with_uppercase"].tolist() == [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1] + assert X_tr["text_uppercase_count"].tolist() == [ + 2, + 5, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3, + 3, + 1, + 2, + 0, + 0, + 2, + 2, + 2, + 2, + 1, + ] + assert X_tr["text_uppercase_ratio"].tolist() == [ + 0.18181818181818182, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.5, + 0.6, + 0.2, + 0.16666666666666666, + 0.0, + 0.0, + 0.14285714285714285, + 0.18181818181818182, + 0.16666666666666666, + 0.125, + 0.01098901098901099, + ] + assert X_tr["text_has_uppercase"].tolist() == [ + 1, + 1, + 0, + 
0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + ] + assert X_tr["text_starts_with_uppercase"].tolist() == [ + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + ] + def test_punctuation_features(): """Test punctuation_features.""" - X = pd.DataFrame({"text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", - ]}) - transformer = TextFeatures(variables=["text"], features=["special_char_count", "ends_with_punctuation"]) + X = pd.DataFrame( + { + "text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ] + } + ) + transformer = TextFeatures( + variables=["text"], features=["special_char_count", "ends_with_punctuation"] + ) X_tr = transformer.fit_transform(X) - assert X_tr["text_special_char_count"].tolist() == [1, 0, 0, 4, 0, 0, 3, 0, 0, 3, 0, 3, 4, 3, 3, 5, 1, 2, 6, 1] - assert X_tr["text_ends_with_punctuation"].tolist() == [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1] + assert X_tr["text_special_char_count"].tolist() == [ + 1, + 0, + 0, + 4, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 3, + 4, + 3, + 3, + 5, + 1, + 2, + 6, + 1, + ] + assert X_tr["text_ends_with_punctuation"].tolist() == [ + 1, + 0, + 0, + 1, + 0, + 0, + 1, + 0, + 0, + 1, + 0, + 0, + 0, + 1, + 0, + 1, + 0, + 1, + 1, + 1, + ] + def test_word_features(): """Test word_features.""" - X = pd.DataFrame({"text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", - ]}) - transformer = TextFeatures(variables=["text"], features=["word_count", "unique_word_count", "lexical_diversity", "avg_word_length"]) + X = pd.DataFrame( + { + "text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... 
World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ] + } + ) + transformer = TextFeatures( + variables=["text"], + features=[ + "word_count", + "unique_word_count", + "lexical_diversity", + "avg_word_length", + ], + ) X_tr = transformer.fit_transform(X) - assert X_tr["text_word_count"].tolist() == [2, 1, 1, 2, 0, 1, 1, 0, 0, 3, 1, 2, 2, 1, 4, 6, 2, 2, 2, 11] - assert X_tr["text_unique_word_count"].tolist() == [2, 1, 1, 2, 0, 1, 1, 0, 0, 3, 1, 2, 2, 1, 4, 6, 2, 2, 2, 11] - assert X_tr["text_lexical_diversity"].tolist() == [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - assert X_tr["text_avg_word_length"].tolist() == [6.0, 5.0, 5.0, 4.5, 0.0, 8.0, 6.0, 0.0, 0.0, 2.6666666666666665, 5.0, 3.0, 6.5, 3.0, 4.75, 3.1666666666666665, 6.0, 6.5, 8.5, 9.181818181818182] + assert X_tr["text_word_count"].tolist() == [ + 2, + 1, + 1, + 2, + 0, + 1, + 1, + 0, + 0, + 3, + 1, + 2, + 2, + 1, + 4, + 6, + 2, + 2, + 2, + 11, + ] + assert X_tr["text_unique_word_count"].tolist() == [ + 2, + 1, + 1, + 2, + 0, + 1, + 1, + 0, + 0, + 3, + 1, + 2, + 2, + 1, + 4, + 6, + 2, + 2, + 2, + 11, + ] + assert X_tr["text_lexical_diversity"].tolist() == [ + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + assert X_tr["text_avg_word_length"].tolist() == [ + 6.0, + 5.0, + 5.0, + 4.5, + 0.0, + 8.0, + 6.0, + 0.0, + 0.0, + 2.6666666666666665, + 5.0, + 3.0, + 6.5, + 3.0, + 4.75, + 3.1666666666666665, + 6.0, + 6.5, + 8.5, + 9.181818181818182, + ] + def test_basic_features(): """Test basic_features.""" - X = pd.DataFrame({"text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... 
World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", - ]}) - transformer = TextFeatures(variables=["text"], features=["char_count", "sentence_count", "letter_count", "lowercase_count", "is_empty"]) + X = pd.DataFrame( + { + "text": [ + "Hello World!", + "HELLO", + "12345", + "e.g. i.e.", + " ", + " trailing ", + "abc...", + "", + None, + "A? B! C.", + "HeLLo", + "Hi! @#", + "A1b2 C3d4!@#$", + "???", + "i.e., this is wrong", + "Is 1 > 2? No, 100%!", + "Hello. World", + "Hello. World.", + "Hello... World!?!", + "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + ] + } + ) + transformer = TextFeatures( + variables=["text"], + features=[ + "char_count", + "sentence_count", + "letter_count", + "lowercase_count", + "is_empty", + ], + ) X_tr = transformer.fit_transform(X) - assert X_tr["text_char_count"].tolist() == [11, 5, 5, 8, 0, 8, 6, 0, 0, 6, 5, 5, 12, 3, 16, 14, 11, 12, 16, 91] - assert X_tr["text_sentence_count"].tolist() == [1, 0, 0, 4, 0, 0, 1, 0, 0, 3, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1] - assert X_tr["text_letter_count"].tolist() == [10, 5, 0, 4, 0, 8, 3, 0, 0, 3, 5, 2, 4, 0, 13, 4, 10, 10, 10, 90] - assert X_tr["text_lowercase_count"].tolist() == [8, 0, 0, 4, 0, 8, 3, 0, 0, 0, 2, 1, 2, 0, 13, 2, 8, 8, 8, 89] - assert X_tr["text_is_empty"].tolist() == [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + assert X_tr["text_char_count"].tolist() == [ + 11, + 5, + 5, + 8, + 0, + 8, + 6, + 0, + 0, + 6, + 5, + 5, + 12, + 3, + 16, + 14, + 11, + 12, + 16, + 91, + ] + assert X_tr["text_sentence_count"].tolist() == [ + 1, + 0, + 0, + 4, + 0, + 0, + 1, + 0, + 0, + 3, + 0, + 1, + 1, + 1, + 2, + 2, + 1, + 2, + 2, + 1, + ] + assert X_tr["text_letter_count"].tolist() == [ + 10, + 5, + 0, + 4, + 0, + 8, + 3, + 0, + 0, + 3, + 5, + 2, + 4, + 0, + 13, + 4, + 10, + 10, + 10, + 90, + ] + assert X_tr["text_lowercase_count"].tolist() == [ + 8, + 0, + 0, + 4, + 
0, + 8, + 3, + 0, + 0, + 0, + 2, + 1, + 2, + 0, + 13, + 2, + 8, + 8, + 8, + 89, + ] + assert X_tr["text_is_empty"].tolist() == [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ] + # ============================================================================== # OTHER METHOD TESTS # ============================================================================== + def test_get_feature_names_out(): """Test get_feature_names_out returns correct names.""" X = pd.DataFrame({"text": ["Hello"], "other": [1]}) transformer = TextFeatures( - variables=["text"], - features=["char_count", "word_count"] + variables=["text"], features=["char_count", "word_count"] ) transformer.fit(X) @@ -388,13 +890,12 @@ def test_get_feature_names_out(): expected_features = ["text", "other", "text_char_count", "text_word_count"] assert feature_names == expected_features + def test_get_feature_names_out_with_drop(): """Test get_feature_names_out with drop_original=True.""" X = pd.DataFrame({"text": ["Hello"], "other": [1]}) transformer = TextFeatures( - variables=["text"], - features=["char_count"], - drop_original=True + variables=["text"], features=["char_count"], drop_original=True ) transformer.fit(X) From 43b0b97a5da00f51f98eeada73984e7c9c027f37 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Sat, 21 Feb 2026 22:27:31 -0600 Subject: [PATCH 14/33] Fix flake8 E501 line length errors by splitting long string literals --- tests/test_text/test_text_features.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index d176b3c39..1cdaa289a 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -234,7 +234,8 @@ def test_whitespace_features(): "Hello. World", "Hello. World.", "Hello... 
World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + "This is a proper sentence containing " + "supercalifragilisticexpialidocious and exceptionally long words.", ] } ) @@ -312,7 +313,8 @@ def test_digit_features(): "Hello. World", "Hello. World.", "Hello... World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + "This is a proper sentence containing " + "supercalifragilisticexpialidocious and exceptionally long words.", ] } ) @@ -412,7 +414,8 @@ def test_uppercase_features(): "Hello. World", "Hello. World.", "Hello... World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + "This is a proper sentence containing " + "supercalifragilisticexpialidocious and exceptionally long words.", ] } ) @@ -540,7 +543,8 @@ def test_punctuation_features(): "Hello. World", "Hello. World.", "Hello... World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + "This is a proper sentence containing " + "supercalifragilisticexpialidocious and exceptionally long words.", ] } ) @@ -618,7 +622,8 @@ def test_word_features(): "Hello. World", "Hello. World.", "Hello... World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + "This is a proper sentence containing " + "supercalifragilisticexpialidocious and exceptionally long words.", ] } ) @@ -746,7 +751,8 @@ def test_basic_features(): "Hello. World", "Hello. World.", "Hello... 
World!?!", - "This is a proper sentence containing supercalifragilisticexpialidocious and exceptionally long words.", + "This is a proper sentence containing " + "supercalifragilisticexpialidocious and exceptionally long words.", ] } ) From 70aa894ae5ae4dd6922034d9d18b23a996118f7e Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Sat, 21 Feb 2026 22:42:02 -0600 Subject: [PATCH 15/33] Remove unused sklearn tags logic to satisfy 100% Codecov threshold --- feature_engine/text/text_features.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index bd1ebe92a..16b52064f 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -341,14 +341,3 @@ def get_feature_names_out(self, input_features=None) -> List[str]: feature_names.append(f"{var}_{feature_name}") return feature_names - - def _more_tags(self): - tags_dict = _return_tags() - tags_dict["allow_nan"] = True - tags_dict["variables"] = "categorical" - return tags_dict - - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.allow_nan = True - return tags From 41aed3c7080018bdd6a75985d257402878bc8fcf Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Sat, 21 Feb 2026 22:44:38 -0600 Subject: [PATCH 16/33] Remove unused feature_engine.tags import flagged by flake8 --- feature_engine/text/text_features.py | 1 - 1 file changed, 1 deletion(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 16b52064f..9e9e5fc64 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -16,7 +16,6 @@ _check_X_matches_training_df, check_X, ) -from feature_engine.tags import _return_tags # Available text features and their computation functions TEXT_FEATURES = { From f830ca2e3139bc5dc533f27995e6105ce8c9626d Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 08:06:54 -0500 Subject: [PATCH 17/33] 
refactor for readability --- feature_engine/text/text_features.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 9e9e5fc64..21da9fab6 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -285,16 +285,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check if dataset contains na if self.missing_values == "raise": _check_contains_na(X, cast(list[Union[str, int]], self.variables_)) + else: + X[self.variables_] = X[self.variables_].fillna("") # reorder variables to match train set X = X[self.feature_names_in_] - # Fill NaN with empty string for feature extraction - # This is safe because if missing_values is 'raise', it would have - # raised an error above. So any remaining NaNs are either intended to - # be filled or there are none. - X[self.variables_] = X[self.variables_].fillna("") - # Extract features for each text variable for var in self.variables_: for feature_name in self.features_: From 04e0795c3bbfae5eeb7fb6293bcb4ddd66541d6b Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 08:10:56 -0500 Subject: [PATCH 18/33] refactor fillna for computations with 0 --- feature_engine/text/text_features.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 21da9fab6..669fd675c 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -22,9 +22,8 @@ "char_count": lambda x: x.str.replace(r"\s+", "", regex=True).str.len(), "word_count": lambda x: x.str.strip().str.split().str.len(), "sentence_count": lambda x: x.str.count(r"[.!?]+"), - "avg_word_length": lambda x: ( - x.str.strip().str.len() / x.str.strip().str.split().str.len() - ).fillna(0), + "avg_word_length": lambda x: x.str.strip().str.len() + / x.str.strip().str.split().str.len(), "digit_count": 
lambda x: x.str.count(r"\d"), "letter_count": lambda x: x.str.count(r"[a-zA-Z]"), "uppercase_count": lambda x: x.str.count(r"[A-Z]"), @@ -42,10 +41,8 @@ "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), "unique_word_count": lambda x: (x.str.lower().str.split().apply(set).str.len()), - "lexical_diversity": lambda x: ( - x.str.strip().str.split().str.len() - / x.str.lower().str.split().apply(set).str.len() - ).fillna(0), + "lexical_diversity": lambda x: x.str.strip().str.split().str.len() + / x.str.lower().str.split().apply(set).str.len(), } From a318e39dfb9de58cf79a373922450a020e3972b5 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 08:16:21 -0500 Subject: [PATCH 19/33] update docstring --- feature_engine/text/text_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 669fd675c..86a358270 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -52,7 +52,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): transformer is useful for extracting basic text statistics that can be used as features in machine learning models. - A list of variables must be passed as an argument. + A list with the text variables must be passed as an argument. More details in the :ref:`User Guide `. 
From c2da382d6019321c2fa5de5c0cf2b4dcc37d0dd4 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 08:25:58 -0500 Subject: [PATCH 20/33] removes sentence from fit docstring --- feature_engine/text/text_features.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 86a358270..3c1784053 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -199,9 +199,6 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ This transformer does not learn any parameters. - Stores feature names and validates that the specified variables are - present. - Parameters ---------- X: pandas dataframe of shape = [n_samples, n_features] From 96b7df99616de5a3bbd8ad8cad08d39575bb81e1 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 08:28:26 -0500 Subject: [PATCH 21/33] upadte fit method dosctring --- feature_engine/text/text_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 3c1784053..d60d374fe 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -206,7 +206,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): variables to transform. y: pandas Series, or np.array. Defaults to None. - It is not needed in this transformer. You can pass y or None. + The target. It is not needed in this transformer. You can pass y or None. 
""" # check input dataframe From a75b1ce413e6030237dbff653c61dcead674efdf Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 08:34:42 -0500 Subject: [PATCH 22/33] replace function to check for na --- feature_engine/text/text_features.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index d60d374fe..d95afa167 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -12,7 +12,7 @@ _check_param_missing_values, ) from feature_engine.dataframe_checks import ( - _check_contains_na, + _check_optional_contains_na, _check_X_matches_training_df, check_X, ) @@ -236,7 +236,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check if dataset contains na if self.missing_values == "raise": - _check_contains_na(X, cast(list[Union[str, int]], self.variables_)) + _check_optional_contains_na(X, self.variables_) # Set features to extract if self.features is None: @@ -278,7 +278,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check if dataset contains na if self.missing_values == "raise": - _check_contains_na(X, cast(list[Union[str, int]], self.variables_)) + _check_optional_contains_na(X, self.variables_) else: X[self.variables_] = X[self.variables_].fillna("") From 9af9ab34c23158981f3da5241f2539c88c49d83d Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 20:32:57 -0500 Subject: [PATCH 23/33] update first init test --- tests/test_text/test_text_features.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 1cdaa289a..a75b817b8 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -20,8 +20,7 @@ ], ) def test_invalid_variables_raises_error(invalid_variables): - """Test that invalid variables parameter raises ValueError.""" - with 
pytest.raises(ValueError, match="variables must be"): + with pytest.raises(ValueError, match="variables must be a string or a list of"): TextFeatures(variables=invalid_variables) From b749709af968143777f1b1cb8f947c2fff173336 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 20:33:47 -0500 Subject: [PATCH 24/33] remove comment from test --- tests/test_text/test_text_features.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index a75b817b8..3209878fe 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -38,7 +38,6 @@ def test_invalid_variables_raises_error(invalid_variables): ], ) def test_invalid_features_raises_error(invalid_features, err_msg): - """Test that invalid features parameter raises ValueError.""" with pytest.raises(ValueError, match=err_msg): TextFeatures(variables=["text"], features=invalid_features) From a914a8024073cccbf0cc703ad806279302993100 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 20:46:36 -0500 Subject: [PATCH 25/33] add parameter to fit attributes test --- tests/test_text/test_text_features.py | 30 ++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 3209878fe..3414d7f89 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -47,16 +47,32 @@ def test_invalid_features_raises_error(invalid_features, err_msg): # ============================================================================== -def test_fit_stores_attributes(): +@pytest.mark.parametrize( + "variables, features", + [ + ("text", None), + (["string"], ["char_count"]), + (["text", "string"], ["sentence_count", "avg_word_length"]), + ], +) +def test_fit_stores_attributes(variables, features): """Test that fit stores expected attributes.""" - X = pd.DataFrame({"text": 
["Hello"]}) - transformer = TextFeatures(variables=["text"]) + X = pd.DataFrame({"text": ["Hello"], "string": ["Bye"]}) + transformer = TextFeatures(variables=variables, features=features) transformer.fit(X) - assert transformer.variables_ == ["text"] - assert transformer.features_ == list(TEXT_FEATURES.keys()) - assert transformer.feature_names_in_ == ["text"] - assert transformer.n_features_in_ == 1 + assert ( + transformer.variables_ == variables + if isinstance(variables, list) + else transformer.variables_ == [variables] + ) + assert ( + transformer.features_ == list(TEXT_FEATURES.keys()) + if features is None + else transformer.features_ == features + ) + assert transformer.feature_names_in_ == ["text", "string"] + assert transformer.n_features_in_ == 2 def test_missing_variable_raises_error(): From d00165214f5e6beee8a398764097894a4110e039 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 20:51:04 -0500 Subject: [PATCH 26/33] add parameters to test var type --- tests/test_text/test_text_features.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 3414d7f89..9695ae4f0 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -56,7 +56,6 @@ def test_invalid_features_raises_error(invalid_features, err_msg): ], ) def test_fit_stores_attributes(variables, features): - """Test that fit stores expected attributes.""" X = pd.DataFrame({"text": ["Hello"], "string": ["Bye"]}) transformer = TextFeatures(variables=variables, features=features) transformer.fit(X) @@ -76,19 +75,17 @@ def test_fit_stores_attributes(variables, features): def test_missing_variable_raises_error(): - """Test that missing variable raises ValueError on fit.""" X = pd.DataFrame({"text": ["Hello"]}) transformer = TextFeatures(variables=["nonexistent"]) with pytest.raises(ValueError, match="not present in the dataframe"): 
transformer.fit(X) -def test_no_text_columns_raises_error(): - """Test that no text columns raises error.""" - X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - transformer = TextFeatures(variables=["a"]) +@pytest.mark.parametrize("variables", ["Age","Marks", "dob"]) +def test_no_text_columns_raises_error(df_vartypes, variables): + transformer = TextFeatures(variables=variables) with pytest.raises(ValueError, match="not object or string"): - transformer.fit(X) + transformer.fit(df_vartypes) def test_nan_handling_raise_error_fit(): From bbe7a94629d80a668073650548a9fddfded71eba Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 20:53:46 -0500 Subject: [PATCH 27/33] add error message to na error test --- tests/test_text/test_text_features.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 9695ae4f0..2c9153fac 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -88,14 +88,13 @@ def test_no_text_columns_raises_error(df_vartypes, variables): transformer.fit(df_vartypes) -def test_nan_handling_raise_error_fit(): - """Test handling of NaN values when missing_values is 'raise' on fit.""" - X = pd.DataFrame({"text": ["Hello", None, "World"]}) +def test_nan_handling_raise_error_fit(df_na): transformer = TextFeatures( - variables=["text"], features=["char_count"], missing_values="raise" + variables=["City"], features=["char_count"], missing_values="raise" ) - with pytest.raises(ValueError): - transformer.fit(X) + msg = "`missing_values='ignore'` when initialising this transformer" + with pytest.raises(ValueError, match=msg): + transformer.fit(df_na) # ============================================================================== From 2fef52d24038d5db5d46be393e93e2bf7c914ee2 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 20:58:57 -0500 Subject: [PATCH 28/33] add error message to na test 
--- tests/test_text/test_text_features.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 2c9153fac..80ce1fb29 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -103,7 +103,6 @@ def test_nan_handling_raise_error_fit(df_na): def test_transform_on_new_data(): - """Test transform works on new data after fit.""" X_train = pd.DataFrame({"text": ["Hello World", "Foo Bar"]}) X_test = pd.DataFrame({"text": ["New Data", "Test 123"]}) @@ -118,14 +117,14 @@ def test_transform_on_new_data(): def test_nan_handling_raise_error_transform(): - """Test handling of NaN values when missing_values is 'raise' on transform.""" X_train = pd.DataFrame({"text": ["Hello", "World"]}) X_test = pd.DataFrame({"text": ["Hello", None, "World"]}) transformer = TextFeatures( variables=["text"], features=["char_count"], missing_values="raise" ) transformer.fit(X_train) - with pytest.raises(ValueError): + msg = "`missing_values='ignore'` when initialising this transformer" + with pytest.raises(ValueError, match=msg): transformer.transform(X_test) From 52047d8b22f8478d7aa8d058ca4ef1a2a4601a51 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 21:11:56 -0500 Subject: [PATCH 29/33] refactor text features tests --- tests/test_text/test_text_features.py | 180 +++----------------------- 1 file changed, 20 insertions(+), 160 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 80ce1fb29..fa03b8275 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -129,7 +129,6 @@ def test_nan_handling_raise_error_transform(): def test_nan_handling(): - """Test handling of NaN values.""" X = pd.DataFrame({"text": ["Hello", None, "World"]}) transformer = TextFeatures(variables=["text"], features=["char_count"]) X_tr = transformer.fit_transform(X) @@ -215,13 
+214,12 @@ def test_multiple_text_columns(): # ============================================================================== -# TRANSFORM TESTS - INDIVIDUAL FEATURES +# TRANSFORM - TEST TEXT FEATURES # ============================================================================== - -def test_whitespace_features(): - """Test whitespace_features.""" - X = pd.DataFrame( +@pytest.fixture(scope="module") +def df_text(): + df = pd.DataFrame( { "text": [ "Hello World!", @@ -248,10 +246,12 @@ def test_whitespace_features(): ] } ) - transformer = TextFeatures( - variables=["text"], features=["whitespace_count", "whitespace_ratio"] - ) - X_tr = transformer.fit_transform(X) + return df + +def test_whitespace_features(df_text): + text_features = ["whitespace_count", "whitespace_ratio"] + transformer = TextFeatures(variables=["text"], features=text_features) + X_tr = transformer.fit_transform(df_text) assert X_tr["text_whitespace_count"].tolist() == [ 1, 0, @@ -298,39 +298,11 @@ def test_whitespace_features(): ] -def test_digit_features(): - """Test digit_features.""" - X = pd.DataFrame( - { - "text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... World!?!", - "This is a proper sentence containing " - "supercalifragilisticexpialidocious and exceptionally long words.", - ] - } - ) +def test_digit_features(df_text): transformer = TextFeatures( variables=["text"], features=["digit_count", "digit_ratio", "has_digits"] ) - X_tr = transformer.fit_transform(X) + X_tr = transformer.fit_transform(df_text) assert X_tr["text_digit_count"].tolist() == [ 0, 0, @@ -399,35 +371,7 @@ def test_digit_features(): ] -def test_uppercase_features(): - """Test uppercase_features.""" - X = pd.DataFrame( - { - "text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. 
i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... World!?!", - "This is a proper sentence containing " - "supercalifragilisticexpialidocious and exceptionally long words.", - ] - } - ) +def test_uppercase_features(df_text): transformer = TextFeatures( variables=["text"], features=[ @@ -437,7 +381,7 @@ def test_uppercase_features(): "starts_with_uppercase", ], ) - X_tr = transformer.fit_transform(X) + X_tr = transformer.fit_transform(df_text) assert X_tr["text_uppercase_count"].tolist() == [ 2, 5, @@ -528,39 +472,11 @@ def test_uppercase_features(): ] -def test_punctuation_features(): - """Test punctuation_features.""" - X = pd.DataFrame( - { - "text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... World!?!", - "This is a proper sentence containing " - "supercalifragilisticexpialidocious and exceptionally long words.", - ] - } - ) +def test_punctuation_features(df_text): transformer = TextFeatures( variables=["text"], features=["special_char_count", "ends_with_punctuation"] ) - X_tr = transformer.fit_transform(X) + X_tr = transformer.fit_transform(df_text) assert X_tr["text_special_char_count"].tolist() == [ 1, 0, @@ -607,35 +523,7 @@ def test_punctuation_features(): ] -def test_word_features(): - """Test word_features.""" - X = pd.DataFrame( - { - "text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... 
World!?!", - "This is a proper sentence containing " - "supercalifragilisticexpialidocious and exceptionally long words.", - ] - } - ) +def test_word_features(df_text): transformer = TextFeatures( variables=["text"], features=[ @@ -645,7 +533,7 @@ def test_word_features(): "avg_word_length", ], ) - X_tr = transformer.fit_transform(X) + X_tr = transformer.fit_transform(df_text) assert X_tr["text_word_count"].tolist() == [ 2, 1, @@ -736,35 +624,7 @@ def test_word_features(): ] -def test_basic_features(): - """Test basic_features.""" - X = pd.DataFrame( - { - "text": [ - "Hello World!", - "HELLO", - "12345", - "e.g. i.e.", - " ", - " trailing ", - "abc...", - "", - None, - "A? B! C.", - "HeLLo", - "Hi! @#", - "A1b2 C3d4!@#$", - "???", - "i.e., this is wrong", - "Is 1 > 2? No, 100%!", - "Hello. World", - "Hello. World.", - "Hello... World!?!", - "This is a proper sentence containing " - "supercalifragilisticexpialidocious and exceptionally long words.", - ] - } - ) +def test_basic_features(df_text): transformer = TextFeatures( variables=["text"], features=[ @@ -775,7 +635,7 @@ def test_basic_features(): "is_empty", ], ) - X_tr = transformer.fit_transform(X) + X_tr = transformer.fit_transform(df_text) assert X_tr["text_char_count"].tolist() == [ 11, 5, From 24f8675aff7b0da06d197fd068386cb75cb2e6f7 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 21:13:03 -0500 Subject: [PATCH 30/33] code style --- tests/test_text/test_text_features.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index fa03b8275..ba906885f 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -81,7 +81,7 @@ def test_missing_variable_raises_error(): transformer.fit(X) -@pytest.mark.parametrize("variables", ["Age","Marks", "dob"]) +@pytest.mark.parametrize("variables", ["Age", "Marks", "dob"]) def test_no_text_columns_raises_error(df_vartypes, 
variables): transformer = TextFeatures(variables=variables) with pytest.raises(ValueError, match="not object or string"): @@ -217,6 +217,7 @@ def test_multiple_text_columns(): # TRANSFORM - TEST TEXT FEATURES # ============================================================================== + @pytest.fixture(scope="module") def df_text(): df = pd.DataFrame( @@ -248,6 +249,7 @@ def df_text(): ) return df + def test_whitespace_features(df_text): text_features = ["whitespace_count", "whitespace_ratio"] transformer = TextFeatures(variables=["text"], features=text_features) @@ -754,7 +756,6 @@ def test_basic_features(df_text): def test_get_feature_names_out(): - """Test get_feature_names_out returns correct names.""" X = pd.DataFrame({"text": ["Hello"], "other": [1]}) transformer = TextFeatures( variables=["text"], features=["char_count", "word_count"] From 6176931b87e9e94bc0d80497b5e7c545a175c8ec Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 21:31:21 -0500 Subject: [PATCH 31/33] break lines in user guide --- docs/user_guide/text/TextFeatures.rst | 77 ++++++++++++++++++++------- 1 file changed, 58 insertions(+), 19 deletions(-) diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst index 79397b81a..84d1b4e22 100644 --- a/docs/user_guide/text/TextFeatures.rst +++ b/docs/user_guide/text/TextFeatures.rst @@ -5,11 +5,20 @@ Extracting Features from Text ============================= -Short pieces of text are often found among the variables in our datasets. For example, in insurance, a text variable can describe the circumstances of an accident. Customer feedback is also stored as a text variable. +Short pieces of text are often found among the variables in our datasets. For example, +in insurance, a text variable can describe the circumstances of an accident. Customer +feedback is also stored as a text variable. 
-While text data as such can't be used to train machine learning models, we can extract a lot of numerical information from these texts, which can provide predictive features to train machine learning models. +While text data as such can't be used to train machine learning models, we can extract +a lot of numerical information from these texts, which can provide predictive features +to train machine learning models. + +Feature-engine allows you to quickly extract numerical features from short pieces of +text, to complement your predictive models. These features aim to capture a piece of +text’s complexity by looking at some statistical parameters of the text, such as the +word length and count, the number of words and unique words used, the number of +sentences, and so on. -Feature-engine allows you to quickly extract numerical features from short pieces of text, to complement your predictive models. These features aim to capture a piece of text’s complexity by looking at some statistical parameters of the text, such as the word length and count, the number of words and unique words used, the number of sentences, and so on. :class:`TextFeatures()` extracts many numerical features from text out-of-the-box. TextFeatures @@ -57,16 +66,26 @@ sentence-ending, which avoids overestimating the count in emphatic text. However, this is still a simple heuristic. It won't handle edge cases like abbreviations (e.g., 'Dr.', 'U.S.', 'e.g.', 'i.e.') or text without punctuation. These abbreviations -will be counted as sentence endings, resulting in an overestimate of the actual sentence count. +will be counted as sentence endings, resulting in an overestimate of the actual sentence +count. -The features **number of unique words** and **lexical diversity** are intended to capture the complexity of the text. Simpler texts have few unique words and tend to repeat them. More complex texts use a wider array of words and tend not to repeat them. 
Hence, in more complex texts, both the number of unique words and the lexical diversity are greater. +The features **number of unique words** and **lexical diversity** are intended to +capture the complexity of the text. Simpler texts have few unique words and tend to +repeat them. More complex texts use a wider array of words and tend not to repeat them. +Hence, in more complex texts, both the number of unique words and the lexical diversity +are greater. Handling missing values ----------------------- -By default, :class:`TextFeatures()` ignores missing values by treating them as empty strings (`missing_values='ignore'`). You can change this behavior by setting the parameter to `'raise'` if you prefer the transformer to raise an error when encountering missing data. +By default, :class:`TextFeatures()` ignores missing values by treating them as empty +strings (`missing_values='ignore'`). You can change this behavior by setting the +parameter to `'raise'` if you prefer the transformer to raise an error when encountering +missing data. + In this case, missing values will be treated as empty strings, and the numerical features -will be calculated accordingly (e.g., word count and character count will be 0) as shown in the following example: +will be calculated accordingly (e.g., word count and character count will be 0) as shown +in the following example: .. code:: python @@ -90,7 +109,8 @@ will be calculated accordingly (e.g., word count and character count will be 0) print(X_transformed) -In the resulting dataframe, we see that the row with NaN returned 0 in the character count: +In the resulting dataframe, we see that the row with NaN returned 0 in the character +count: .. code-block:: none @@ -138,22 +158,30 @@ The input dataframe looks like this: 2 OK for the price. 3 out of 5 stars. Average 3 TERRIBLE!!! DO NOT BUY! 
Awful -Now let's extract 5 specific text features: the number of words, the number of characters, the number of sentences, whether the text has digits, and the ratio of upper- to lowercase: +Now let's extract 5 specific text features: the number of words, the number of +characters, the number of sentences, whether the text has digits, and the ratio of +uppercase characters to total characters: .. code:: python # Set up the transformer with specific features tf = TextFeatures( variables=['review'], - features=['word_count', 'char_count', 'sentence_count', 'has_digits', 'uppercase_ratio'] - ) + features=[ + 'word_count', + 'char_count', + 'sentence_count', + 'has_digits', + 'uppercase_ratio', + ]) # Fit and transform X_transformed = tf.fit_transform(X) print(X_transformed) -In the following output, we see the resulting dataframe containing the numerical features extracted from the pieces of text: +In the following output, we see the resulting dataframe containing the numerical +features extracted from the pieces of text: .. code-block:: none @@ -225,7 +253,8 @@ The output dataframe contains all 20 text features extracted from the `review` c Dropping original columns ~~~~~~~~~~~~~~~~~~~~~~~~~ -You can drop the original text columns after extracting features, by setting the parameter `drop_original` to `True`: +You can drop the original text columns after extracting features, by setting the +parameter `drop_original` to `True`: .. code:: python @@ -239,7 +268,8 @@ You can drop the original text columns after extracting features, by setting the print(X_transformed) -The original `'review'` column has been removed, and only the `'title'` column and the extracted features remain: +The original `'review'` column has been removed, and only the `'title'` column and the +extracted features remain: .. 
code-block:: none @@ -252,9 +282,13 @@ The original `'review'` column has been removed, and only the `'title'` column a Combining with scikit-learn Bag-of-Words ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In most NLP tasks, it is common to use bag-of-words (e.g., `CountVectorizer`) or TF-IDF (e.g., `TfidfVectorizer`) to represent the text. :class:`TextFeatures()` can be used alongside these transformers to provide additional metadata that might improve model performance. +In most NLP tasks, it is common to use bag-of-words (e.g., `CountVectorizer`) or TF-IDF +(e.g., `TfidfVectorizer`) to represent the text. :class:`TextFeatures()` can be used +alongside these transformers to provide additional metadata that might improve model +performance. -In the following example, we compare a baseline model using only TF-IDF with a model that combines TF-IDF and :class:`TextFeatures()` metadata: +In the following example, we compare a baseline model using only TF-IDF with a model +that combines TF-IDF and :class:`TextFeatures()` metadata: .. code:: python @@ -284,7 +318,8 @@ The input dataframe contains the raw text of newsgroup posts: 892 From: ggg@hhh.iii\nSubject: NHL Scores\nOrga... 317 From: jjj@kkk.lll (Bob Wilson)\nSubject: Re:... -Now let's set up two pipelines to compare a baseline model using only TF-IDF with a model that combines TF-IDF and :class:`TextFeatures()` metadata: +Now let's set up two pipelines to compare a baseline model using only TF-IDF with a +model that combines TF-IDF and :class:`TextFeatures()` metadata: .. 
code:: python @@ -316,11 +351,15 @@ Now let's set up two pipelines to compare a baseline model using only TF-IDF wit combined_pipe.fit(X_train, y_train) print(f"Combined Accuracy: {combined_pipe.score(X_test, y_test):.3f}") -Below we see the accuracy of a model trained using only the bag of words, respect to a model trained using both the bag of words and the additional meta data: +Below we see the accuracy of a model trained using only the bag of words, compared to a +model trained using both the bag of words and the additional metadata: .. code-block:: none TF-IDF Accuracy: 0.957 Combined Accuracy: 0.963 -By adding statistical metadata through :class:`TextFeatures()`, we provided the model with information about text length, complexity, and style that is not explicitly captured by a word-count-based approach like TF-IDF, leading to a small but noticeable improvement in performance. +By adding statistical metadata through :class:`TextFeatures()`, we provided the model +with information about text length, complexity, and style that is not explicitly +captured by a word-count-based approach like TF-IDF, leading to a small but noticeable +improvement in performance. 
From 3ca6b17c4ee5d1144f7026080eb7784cd5efbae2 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sun, 22 Feb 2026 21:35:46 -0500 Subject: [PATCH 32/33] fix code style --- feature_engine/text/text_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index d95afa167..fe0760f61 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -1,6 +1,6 @@ # Authors: Ankit Hemant Lade (contributor) # License: BSD 3 clause -from typing import List, Optional, Union, cast +from typing import List, Optional, Union import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin From 1e2899099910693dd460f1417fe0d63085b1c298 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Sun, 22 Feb 2026 21:59:27 -0600 Subject: [PATCH 33/33] fix: restore cast to avoid mypy list invariant error --- feature_engine/text/text_features.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index fe0760f61..8299863ff 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -1,6 +1,6 @@ # Authors: Ankit Hemant Lade (contributor) # License: BSD 3 clause -from typing import List, Optional, Union +from typing import List, Optional, Union, cast import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin @@ -236,7 +236,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check if dataset contains na if self.missing_values == "raise": - _check_optional_contains_na(X, self.variables_) + _check_optional_contains_na(X, cast(list[Union[str, int]], self.variables_)) # Set features to extract if self.features is None: @@ -278,7 +278,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check if dataset contains na if self.missing_values == "raise": - _check_optional_contains_na(X, self.variables_) + 
_check_optional_contains_na(X, cast(list[Union[str, int]], self.variables_)) else: X[self.variables_] = X[self.variables_].fillna("")