# Source code for fugue.extensions.processor.processor

from abc import ABC, abstractmethod

from fugue.dataframe import DataFrame, DataFrames
from fugue.extensions.context import ExtensionContext


class Processor(ExtensionContext, ABC):
    """Interface for processing one or more dataframes into one DataFrame.

    For example, dropping a column of a dataframe should be a type of
    Processor.

    Processor is a task level extension, running on the driver, and it is
    execution engine aware.

    To implement this class, you should not define ``__init__``; directly
    implement the interface functions instead.

    .. note::

        Before implementing this class, do you really need to implement this
        interface? Do you know the interfaceless feature of Fugue?
        Implementing Processor is commonly unnecessary. You can choose the
        interfaceless approach, which may decouple your code from Fugue.

    .. seealso::

        Please read
        :doc:`Processor Tutorial <tutorial:tutorials/extensions/processor>`
    """

    @abstractmethod
    def process(self, dfs: DataFrames) -> DataFrame:  # pragma: no cover
        """Process the collection of dataframes on the driver side.

        .. note::

            * It runs on the driver side.
            * The dataframes are not necessarily local, for example a
              SparkDataFrame.
            * It is engine aware: you can put platform dependent code in it
              (for example native pyspark code), but by doing so your code
              may not be portable. If you only use the functions of the
              general ExecutionEngine, it is still portable.

        :param dfs: dataframe collection to process
        :return: the result dataframe
        """
        # Abstract hook: concrete subclasses must override this method.
        raise NotImplementedError