fugue.column#

fugue.column.expressions#

class fugue.column.expressions.ColumnExpr[source]#

Bases: object

Fugue column expression class. It is inspired from pyspark.sql.Column and it is working in progress.

New Since

0.6.0

Caution

This is a base class of different column classes, and users are not supposed to construct this class directly. Use col() and lit() instead.

alias(as_name)[source]#

Assign or remove alias of a column. To remove, set as_name to empty

Returns

a new column with the alias value

Parameters

as_name (str) –

Return type

ColumnExpr

Examples

assert "b" == col("a").alias("b").as_name
assert "x" == (col("a") * 2).alias("x").as_name
assert "" == col("a").alias("b").alias("").as_name
property as_name: str#

The name assigned by alias()

Returns

the alias

Examples

assert "" == col("a").as_name
assert "b" == col("a").alias("b").as_name
assert "x" == (col("a") * 2).alias("x").as_name
property as_type: Optional[DataType]#

The type assigned by cast()

Returns

the pyarrow datatype if cast() was called otherwise None

Examples

import pyarrow as pa

assert col("a").as_type is None
assert pa.int64() == col("a").cast(int).as_type
assert pa.string() == (col("a") * 2).cast(str).as_type
property body_str: str#

The string expression of this column without cast type and alias. This is only used for debug purpose. It is not SQL expression.

Returns

the string expression

cast(data_type)[source]#

Cast the column to a new data type

Parameters

data_type (Any) – It can be string expressions, python primitive types, python datetime.datetime and pyarrow types. For details read Fugue Data Types

Returns

a new column instance with the assigned data type

Return type

ColumnExpr

Caution

Currently, casting to struct or list type has undefined behavior.

Examples

import pyarrow as pa

assert pa.int64() == col("a").cast(int).as_type
assert pa.string() == col("a").cast(str).as_type
assert pa.float64() == col("a").cast(float).as_type
assert pa._bool() == col("a").cast(bool).as_type

# string follows the type expression of Triad Schema
assert pa.int32() == col("a").cast("int").as_type
assert pa.int32() == col("a").cast("int32").as_type

assert pa.int32() == col("a").cast(pa.int32()).as_type
infer_alias()[source]#

Infer alias of a column. If the column’s output_name() is not empty then it returns itself without change. Otherwise it tries to infer alias from the underlying columns.

Returns

a column instance with inferred alias

Return type

ColumnExpr

Caution

Users should not use it directly.

Examples

import fugue.column.functions as f

assert "a" == col("a").infer_alias().output_name
assert "" == (col("a") * 2).infer_alias().output_name
assert "a" == col("a").is_null().infer_alias().output_name
assert "a" == f.max(col("a").is_null()).infer_alias().output_name
infer_type(schema)[source]#

Infer data type of this column given the input schema

Parameters

schema (Schema) – the schema instance to infer from

Returns

a pyarrow datatype or None if failed to infer

Return type

Optional[DataType]

Caution

Users should not use it directly.

Examples

import pyarrow as pa
from triad import Schema
import fugue.column.functions as f

schema = Schema("a:int,b:str")

assert pa.int32() == col("a").infer_schema(schema)
assert pa.int32() == (-col("a")).infer_schema(schema)
# due to overflow risk, can't infer certain operations
assert (col("a")+1).infer_schema(schema) is None
assert (col("a")+col("a")).infer_schema(schema) is None
assert pa.int32() == f.max(col("a")).infer_schema(schema)
assert pa.int32() == f.min(col("a")).infer_schema(schema)
assert f.sum(col("a")).infer_schema(schema) is None
is_null()[source]#

Same as SQL <col> IS NULL.

Returns

a new column with the boolean values

Return type

ColumnExpr

property name: str#

The original name of this column, default empty

Returns

the name

Examples

assert "a" == col("a").name
assert "b" == col("a").alias("b").name
assert "" == lit(1).name
assert "" == (col("a") * 2).name
not_null()[source]#

Same as SQL <col> IS NOT NULL.

Returns

a new column with the boolean values

Return type

ColumnExpr

property output_name: str#

The name assigned by alias(), but if empty then return the original column name

Returns

the alias or the original column name

Examples

assert "a" == col("a").output_name
assert "b" == col("a").alias("b").output_name
assert "x" == (col("a") * 2).alias("x").output_name
fugue.column.expressions.all_cols()[source]#

The * expression in SQL

Return type

ColumnExpr

fugue.column.expressions.col(obj, alias='')[source]#

Convert the obj to a ColumnExpr object

Parameters
  • obj (Union[str, ColumnExpr]) – a string representing a column name or a ColumnExpr object

  • alias (str) – the alias of this column, defaults to “” (no alias)

Returns

a literal column expression

Return type

ColumnExpr

New Since

0.6.0

Examples

import fugue.column import col
import fugue.column.functions as f

col("a")
col("a").alias("x")
col("a", "x")

# unary operations
-col("a")  # negative value of a
~col("a")  # NOT a
col("a").is_null()  # a IS NULL
col("a").not_null()  # a IS NOT NULL

# binary operations
col("a") + 1  # col("a") + lit(1)
1 - col("a")  # lit(1) - col("a")
col("a") * col("b")
col("a") / col("b")

# binary boolean expressions
col("a") == 1  # col("a") == lit(1)
2 != col("a")  # col("a") != lit(2)
col("a") < 5
col("a") > 5
col("a") <= 5
col("a") >= 5
(col("a") < col("b")) & (col("b") > 1) | col("c").is_null()

# with functions
f.max(col("a"))
f.max(col("a")+col("b"))
f.max(col("a")) + f.min(col("b"))
f.count_distinct(col("a")).alias("dcount")
fugue.column.expressions.function(name, *args, arg_distinct=False, **kwargs)[source]#

Construct a function expression

Parameters
  • name (str) – the name of the function

  • arg_distinct (bool) – whether to add DISTINCT before all arguments, defaults to False

  • args (Any) –

Returns

the function expression

Return type

ColumnExpr

Caution

Users should not use this directly

fugue.column.expressions.lit(obj, alias='')[source]#

Convert the obj to a literal column. Currently obj must be int, bool, float or str, otherwise an exception will be raised

Parameters
  • obj (Any) – an arbitrary value

  • alias (str) – the alias of this literal column, defaults to “” (no alias)

Returns

a literal column expression

Return type

ColumnExpr

New Since

0.6.0

Examples

import fugue.column import lit

lit("abc")
lit(100).alias("x")
lit(100, "x")
fugue.column.expressions.null()[source]#

Equivalent to lit(None), the NULL value

Returns

lit(None)

Return type

ColumnExpr

New Since

0.6.0

fugue.column.functions#

fugue.column.functions.avg(col)[source]#

SQL AVG function (aggregation)

Parameters

col (ColumnExpr) – the column to find average

Return type

ColumnExpr

Note

  • this function cannot infer type from col type

  • this function can infer alias from col’s inferred alias

New Since

0.6.0

Examples

import fugue.column.functions as f

f.avg(col("a"))  # AVG(a) AS a

# you can specify explicitly
# CAST(AVG(a) AS double) AS a
f.avg(col("a")).cast(float)
fugue.column.functions.coalesce(*args)[source]#

SQL COALESCE function

Parameters

args (Any) – If a value is not ColumnExpr then it’s converted to a literal column by col()

Return type

ColumnExpr

Note

this function can infer neither type nor alias

New Since

0.6.0

Examples

import fugue.column.functions as f

f.coalesce(col("a"), col("b")+col("c"), 1)
fugue.column.functions.count(col)[source]#

SQL COUNT function (aggregation)

Parameters

col (ColumnExpr) – the column to find count

Return type

ColumnExpr

Note

  • this function cannot infer type from col type

  • this function can infer alias from col’s inferred alias

New Since

0.6.0

Examples

import fugue.column.functions as f

f.count(all_cols())  # COUNT(*)
f.count(col("a"))  # COUNT(a) AS a

# you can specify explicitly
# CAST(COUNT(a) AS double) AS a
f.count(col("a")).cast(float)
fugue.column.functions.count_distinct(col)[source]#

SQL COUNT DISTINCT function (aggregation)

Parameters

col (ColumnExpr) – the column to find distinct element count

Return type

ColumnExpr

Note

  • this function cannot infer type from col type

  • this function can infer alias from col’s inferred alias

New Since

0.6.0

Examples

import fugue.column.functions as f

f.count_distinct(all_cols())  # COUNT(DISTINCT *)
f.count_distinct(col("a"))  # COUNT(DISTINCT a) AS a

# you can specify explicitly
# CAST(COUNT(DISTINCT a) AS double) AS a
f.count_distinct(col("a")).cast(float)
fugue.column.functions.first(col)[source]#

SQL FIRST function (aggregation)

Parameters

col (ColumnExpr) – the column to find first

Return type

ColumnExpr

Note

  • this function can infer type from col type

  • this function can infer alias from col’s inferred alias

New Since

0.6.0

Examples

import fugue.column.functions as f

# assume col a has type double
f.first(col("a"))  # CAST(FIRST(a) AS double) AS a
f.first(-col("a"))  # CAST(FIRST(-a) AS double) AS a

# neither type nor alias can be inferred in the following cases
f.first(col("a")+1)
f.first(col("a")+col("b"))

# you can specify explicitly
# CAST(FIRST(a+b) AS int) AS x
f.first(col("a")+col("b")).cast(int).alias("x")
fugue.column.functions.is_agg(column)[source]#

Check if a column contains aggregation operation

Parameters
  • col – the column to check

  • column (Any) –

Returns

whether the column is ColumnExpr and contains aggregation operations

Return type

bool

New Since

0.6.0

Examples

import fugue.column.functions as f

assert not f.is_agg(1)
assert not f.is_agg(col("a"))
assert not f.is_agg(col("a")+lit(1))

assert f.is_agg(f.max(col("a")))
assert f.is_agg(-f.max(col("a")))
assert f.is_agg(f.max(col("a")+1))
assert f.is_agg(f.max(col("a"))+f.min(col("a"))))
fugue.column.functions.last(col)[source]#

SQL LAST function (aggregation)

Parameters

col (ColumnExpr) – the column to find last

Return type

ColumnExpr

Note

  • this function can infer type from col type

  • this function can infer alias from col’s inferred alias

New Since

0.6.0

Examples

import fugue.column.functions as f

# assume col a has type double
f.last(col("a"))  # CAST(LAST(a) AS double) AS a
f.last(-col("a"))  # CAST(LAST(-a) AS double) AS a

# neither type nor alias can be inferred in the following cases
f.last(col("a")+1)
f.last(col("a")+col("b"))

# you can specify explicitly
# CAST(LAST(a+b) AS int) AS x
f.last(col("a")+col("b")).cast(int).alias("x")
fugue.column.functions.max(col)[source]#

SQL MAX function (aggregation)

Parameters

col (ColumnExpr) – the column to find max

Return type

ColumnExpr

Note

  • this function can infer type from col type

  • this function can infer alias from col’s inferred alias

New Since

0.6.0

Examples

import fugue.column.functions as f

# assume col a has type double
f.max(col("a"))  # CAST(MAX(a) AS double) AS a
f.max(-col("a"))  # CAST(MAX(-a) AS double) AS a

# neither type nor alias can be inferred in the following cases
f.max(col("a")+1)
f.max(col("a")+col("b"))

# you can specify explicitly
# CAST(MAX(a+b) AS int) AS x
f.max(col("a")+col("b")).cast(int).alias("x")
fugue.column.functions.min(col)[source]#

SQL MIN function (aggregation)

Parameters

col (ColumnExpr) – the column to find min

Return type

ColumnExpr

Note

  • this function can infer type from col type

  • this function can infer alias from col’s inferred alias

New Since

0.6.0

Examples

import fugue.column.functions as f

# assume col a has type double
f.min(col("a"))  # CAST(MIN(a) AS double) AS a
f.min(-col("a"))  # CAST(MIN(-a) AS double) AS a

# neither type nor alias can be inferred in the following cases
f.min(col("a")+1)
f.min(col("a")+col("b"))

# you can specify explicitly
# CAST(MIN(a+b) AS int) AS x
f.min(col("a")+col("b")).cast(int).alias("x")
fugue.column.functions.sum(col)[source]#

SQL SUM function (aggregation)

Parameters

col (ColumnExpr) – the column to find sum

Return type

ColumnExpr

Note

  • this function cannot infer type from col type

  • this function can infer alias from col’s inferred alias

New Since

0.6.0

Examples

import fugue.column.functions as f

f.sum(col("a"))  # SUM(a) AS a

# you can specify explicitly
# CAST(SUM(a) AS double) AS a
f.sum(col("a")).cast(float)

fugue.column.sql#

class fugue.column.sql.SQLExpressionGenerator(enable_cast=True)[source]#

Bases: object

SQL generator for SelectColumns

Parameters

enable_cast (bool) – whether convert cast into the statement, defaults to True

New Since

0.6.0

add_func_handler(name, handler)[source]#

Add special function handler.

Parameters
  • name (str) – name of the function

  • handler (Callable[[_FuncExpr], Iterable[str]]) – the function to convert the function expression to SQL clause

Returns

the instance itself

Return type

SQLExpressionGenerator

Caution

Users should not use this directly

correct_select_schema(input_schema, select, output_schema)[source]#

Do partial schema inference from input_schema and select columns, then compare with the SQL output dataframe schema, and return the different part as a new schema, or None if there is no difference

Parameters
  • input_schema (Schema) – input dataframe schema for the select statement

  • select (SelectColumns) – the collection of select columns

  • output_schema (Schema) – schema of the output dataframe after executing the SQL

Returns

the difference as a new schema or None if no difference

Return type

Optional[Schema]

Tip

This is particularly useful when the SQL engine messed up the schema of the output. For example, SELECT * should return a dataframe with the same schema of the input. However, for example a column a:int could become a:long in the output dataframe because of information loss. This function is designed to make corrections on column types when they can be inferred. This may not be perfect but it can solve major discrepancies.

generate(expr)[source]#

Convert ColumnExpr to SQL clause

Parameters

expr (ColumnExpr) – the column expression to convert

Returns

the SQL clause for this expression

Return type

str

select(columns, table, where=None, having=None)[source]#

Construct the full SELECT statement on a single table

Parameters
  • columns (SelectColumns) – columns to select, it may contain aggregations, if so, the group keys are inferred. See group_keys()

  • table (str) – table name to select from

  • where (Optional[ColumnExpr]) – WHERE condition, defaults to None

  • having (Optional[ColumnExpr]) – HAVING condition, defaults to None. It is used only when there is aggregation

Returns

the full SELECT statement

Return type

Iterable[Tuple[bool, str]]

type_to_expr(data_type)[source]#
Parameters

data_type (DataType) –

where(condition, table)[source]#

Generate a SELECT * statement with the given where clause

Parameters
  • condition (ColumnExpr) – column expression for WHERE

  • table (str) – table name for FROM

Returns

the SQL statement

Raises

ValueError – if condition contains aggregation

Return type

Iterable[Tuple[bool, str]]

Examples

gen = SQLExpressionGenerator(enable_cast=False)

# SELECT * FROM tb WHERE a>1 AND b IS NULL
gen.where((col("a")>1) & col("b").is_null(), "tb")
class fugue.column.sql.SelectColumns(*cols, arg_distinct=False)[source]#

Bases: object

SQL SELECT columns collection.

Parameters
  • cols (ColumnExpr) – collection of ColumnExpr

  • arg_distinct (bool) – whether this is SELECT DISTINCT, defaults to False

New Since

0.6.0

property agg_funcs: List[ColumnExpr]#

All columns with aggregation operations

property all_cols: List[ColumnExpr]#

All columns (with inferred aliases)

assert_all_with_names()[source]#

Assert every column have explicit alias or the alias can be inferred (non empty value). It will also validate there is no duplicated aliases

Raises

ValueError – if there are columns without alias, or there are duplicated names.

Returns

the instance itself

Return type

SelectColumns

assert_no_agg()[source]#

Assert there is no aggregation operation on any column.

Raises

AssertionError – if there is any aggregation in the collection.

Returns

the instance itself

Return type

SelectColumns

See also

Go to is_agg() to see how the aggregations are detected.

assert_no_wildcard()[source]#

Assert there is no * on first level columns

Raises

AssertionError – if all_cols() exists

Returns

the instance itself

Return type

SelectColumns

property group_keys: List[ColumnExpr]#

Group keys inferred from the columns.

Note

property has_agg: bool#

Whether this select is an aggregation

property has_literals: bool#

Whether this select contains literal columns

property is_distinct: bool#

Whether this is a SELECT DISTINCT

property literals: List[ColumnExpr]#

All literal columns

property non_agg_funcs: List[ColumnExpr]#

All columns with non-aggregation operations

replace_wildcard(schema)[source]#

Replace wildcard * with explicit column names

Parameters

schema (Schema) – the schema used to parse the wildcard

Returns

a new instance containing only explicit columns

Return type

SelectColumns

Note

It only replaces the top level *. For example count_distinct(all_cols()) will not be transformed because this * is not first level.

property simple: bool#

Whether this select contains only simple column representations

property simple_cols: List[ColumnExpr]#

All columns directly representing column names