Module `jumpscale.core.base.store.whooshfts`

Expand source code

from whoosh import fields
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import FuzzyTermPlugin, GtLtPlugin, MultifieldParser, PhrasePlugin
from whoosh.writing import AsyncWriter

from . import ConfigNotFound, EncryptedConfigStore, KEY_FIELD_NAME
from .serializers import Serializer

from jumpscale.sals.fs import join_paths, mkdirs

# a map between our indexable fields and whoosh fields,
# keyed by the field class name (looked up in get_schema via field.__class__.__name__)
# for now we don't support nested fields like Lists or Objects
# they will only be stored but not indexed
FIELD_MAP = {
    "Boolean": fields.BOOLEAN(stored=True),
    "Bytes": fields.TEXT(stored=True),
    "Email": fields.TEXT(stored=True),
    "GUID": fields.TEXT(stored=True),
    "IPAddress": fields.TEXT(stored=True),
    "IPRange": fields.TEXT(stored=True),
    "Json": fields.TEXT(stored=True),
    "Path": fields.ID(stored=True),
    "String": fields.TEXT(stored=True),
    "Tel": fields.TEXT(stored=True),
    "URL": fields.TEXT(stored=True),
    "Integer": fields.NUMERIC(bits=64, stored=True, sortable=True),
    "Float": fields.NUMERIC(float, bits=64, stored=True, sortable=True),
    "Port": fields.NUMERIC(stored=True, sortable=True),
    "Date": fields.NUMERIC(stored=True, sortable=True),
    "DateTime": fields.NUMERIC(stored=True, sortable=True),
    "Time": fields.NUMERIC(stored=True, sortable=True),
}

# enums and secret fields are special cases
# an enum can be stored as int, bool, or str values,
# so the whoosh field type is selected based on the enum values (assuming they're all of the same type)
ENUM_FIELD = "Enum"
# a secret's field name is prefixed with underscores, which is not valid in a whoosh schema
# the prefix is removed/added when writing/reading the data
SECRET_FIELD = "Secret"


class WhooshStore(EncryptedConfigStore):
    """
    whoosh store is an EncryptedConfigStore

    It saves and indexes the data in a whoosh index
    """

    def __init__(self, location):
        """
        create a new whoosh store

        the index directory is the "path" entry of the "whoosh" store config
        joined with the location name

        Args:
            location (Location)
        """
        # NOTE(review): self.config_env and self.location are presumably set by the parent initializer
        super().__init__(location, Serializer())
        config = self.config_env.get_store_config("whoosh")
        self.base_index_path = config["path"]

        self.schema = self.get_schema()
        self.index = self.get_index(self.schema)

        self.default_plugins = [FuzzyTermPlugin(), GtLtPlugin(), PhrasePlugin()]
        self.default_pagenum = 1
        self.default_pagelen = 20

    @property
    def index_path(self):
        """
        directory of this store's index (base index path joined with the location name)

        mkdirs is called on the path at every access
        """
        path = join_paths(self.base_index_path, self.location.name)
        mkdirs(path)
        return path

    @property
    def type_fields(self):
        """
        (name, field) pairs of the fields declared on the location type
        """
        return self.location.type._fields.items()

    def get_schema(self):
        """
        build the whoosh schema from the fields of the location type

        fields found in FIELD_MAP use the mapped whoosh field, enum fields are mapped
        according to the type of their value, anything else is only stored (not indexed)

        Returns:
            whoosh.fields.Schema: the schema, including the unique key field
        """
        schema_fields = {KEY_FIELD_NAME: fields.ID(unique=True, stored=True)}

        for name, field in self.type_fields:
            field_type_name = field.__class__.__name__
            if field_type_name in FIELD_MAP:
                schema_fields[name] = FIELD_MAP[field_type_name]
            elif field_type_name == ENUM_FIELD:
                # it depends on the type of enum values
                # the check must be done on a value, not on type(value): isinstance(<a type>, str)
                # is always False, which made every enum fall back to STORED
                default = field.default
                # NOTE(review): assumes the default is either an enum member (exposing `.value`)
                # or already the raw value - confirm against the Enum field implementation
                enum_value = getattr(default, "value", default)
                if isinstance(enum_value, bool):
                    # bool is checked first, as it is a subclass of int
                    schema_field = fields.BOOLEAN(stored=True)
                elif isinstance(enum_value, (str, bytes, bytearray)):
                    schema_field = fields.TEXT(stored=True)
                elif isinstance(enum_value, (int, float)):
                    schema_field = fields.NUMERIC(stored=True)
                else:
                    schema_field = fields.STORED

                schema_fields[name] = schema_field
            else:
                schema_fields[name] = fields.STORED

        return fields.Schema(**schema_fields)

    def get_index(self, schema):
        """
        open the index at index_path if one exists there, otherwise create it

        Args:
            schema (whoosh.fields.Schema): schema used to open/create the index
        """
        if exists_in(self.index_path):
            return open_dir(self.index_path, schema=schema)
        return create_in(self.index_path, schema=schema)

    def get_reader(self):
        """
        get a reader of the index
        """
        return self.index.reader()

    def get_writer(self):
        """
        get a new AsyncWriter for the index
        """
        return AsyncWriter(self.index)

    def get_searcher(self, up_to_date=True):
        """
        get a searcher of the index

        Args:
            up_to_date (bool, optional): refresh the searcher if it reports not being up to date. Defaults to True.
        """
        searcher = self.index.searcher()

        if up_to_date and not searcher.up_to_date():
            searcher = searcher.refresh()

        return searcher

    def read(self, instance_name):
        """
        read the stored document of an instance

        Args:
            instance_name (str): instance name, matched against the key field

        Raises:
            ConfigNotFound: if no document is found for this instance name

        Returns:
            dict: stored fields, secret fields are keyed with their "__" prefix
        """
        with self.get_searcher() as searcher:
            kw = {KEY_FIELD_NAME: instance_name}
            doc = searcher.document(**kw)

            if not doc:
                raise ConfigNotFound(f"cannot find config for {instance_name} in the index")

            for name, field in self.type_fields:
                # whoosh does not store None values, so, we just set them
                # if they are not set, that means when they're added, they'd the value of None
                if name not in doc and field.stored:
                    doc[name] = None

                # add __ to field name by hand
                # as we cannot add a field that starts with "__" in whoosh schema
                # the field may be absent (value not stored), renaming it then used to raise KeyError
                if field.__class__.__name__ == SECRET_FIELD and name in doc:
                    doc[f"__{name}"] = doc.pop(name)

            return doc

    def write(self, instance_name, data):
        """
        add or replace the document of an instance

        Args:
            instance_name (str): instance name, written to the key field
            data (dict): field values, it is copied and not modified
        """
        # work on a copy, so the caller's mapping is left untouched
        document = dict(data)
        document[KEY_FIELD_NAME] = instance_name

        for name, field in self.type_fields:
            if field.__class__.__name__ == SECRET_FIELD:
                name_with_prefix = f"__{name}"
                if name_with_prefix in document:
                    # remove "__", as the whoosh field cannot start with it
                    # the original field name is the one used in whoosh schema
                    document[name] = document.pop(name_with_prefix)

        # as a context manager, the writer commits on success and cancels on error,
        # so a failing update does not leave the writer (and its lock) dangling
        with self.get_writer() as writer:
            writer.update_document(**document)

    def list_all(self):
        """
        iterate over the instance names (key field values) of all documents in the index
        """
        with self.get_reader() as reader:
            for _, doc in reader.iter_docs():
                yield doc[KEY_FIELD_NAME]

    def find(self, cursor_=None, limit_=None, **queries):
        """
        search the index, every keyword argument is turned into a `field:value` query term

        Args:
            cursor_ (int, optional): page number, `default_pagenum` is used when not given
            limit_ (int, optional): page length, `default_pagelen` is used when not given
            **queries: field names mapped to the query text of each field

        Returns:
            tuple: (next page number or None on the last page, `len()` of the result, generator of hits)
        """
        # local name chosen so that the module-level whoosh `fields` import is not shadowed
        field_names = list(queries)
        query_text = " ".join(f"{name}:{value}" for name, value in queries.items())

        parser = MultifieldParser(field_names, schema=self.schema)
        parser.add_plugins(self.default_plugins)

        query = parser.parse(query_text)
        # NOTE(review): the searcher is not closed here, the hits are returned lazily
        # and presumably still need it while the caller consumes them - confirm before closing it
        searcher = self.get_searcher()

        if not cursor_:
            cursor_ = self.default_pagenum
        if not limit_:
            limit_ = self.default_pagelen

        result = searcher.search_page(query, pagenum=cursor_, pagelen=limit_)

        new_cursor = None if result.is_last_page() else result.pagenum + 1

        if result.total >= limit_:
            result = result[:limit_]

        return new_cursor, len(result), (hit for hit in result)

    def delete(self, instance_name):
        """
        delete the document of an instance

        Args:
            instance_name (str): instance name, matched against the key field
        """
        # commit on success, cancel on error (see write)
        with self.get_writer() as writer:
            writer.delete_by_term(KEY_FIELD_NAME, instance_name)

Classes

class WhooshStore (location)

whoosh store is an EncryptedConfigStore

It saves and indexes the data in a whoosh index

create a new whoosh store, the location given will be used to generate keys

these keys will be combined to get/set instance config

Args

location (Location)

Expand source code

class WhooshStore(EncryptedConfigStore):
    """
    whoosh store is an EncryptedConfigStore

    It saves and indexes the data in a whoosh index
    """

    def __init__(self, location):
        """
        create a new whoosh store

        the index directory is the "path" entry of the "whoosh" store config
        joined with the location name

        Args:
            location (Location)
        """
        # NOTE(review): self.config_env and self.location are presumably set by the parent initializer
        super().__init__(location, Serializer())
        config = self.config_env.get_store_config("whoosh")
        self.base_index_path = config["path"]

        self.schema = self.get_schema()
        self.index = self.get_index(self.schema)

        self.default_plugins = [FuzzyTermPlugin(), GtLtPlugin(), PhrasePlugin()]
        self.default_pagenum = 1
        self.default_pagelen = 20

    @property
    def index_path(self):
        """
        directory of this store's index (base index path joined with the location name)

        mkdirs is called on the path at every access
        """
        path = join_paths(self.base_index_path, self.location.name)
        mkdirs(path)
        return path

    @property
    def type_fields(self):
        """
        (name, field) pairs of the fields declared on the location type
        """
        return self.location.type._fields.items()

    def get_schema(self):
        """
        build the whoosh schema from the fields of the location type

        fields found in FIELD_MAP use the mapped whoosh field, enum fields are mapped
        according to the type of their value, anything else is only stored (not indexed)

        Returns:
            whoosh.fields.Schema: the schema, including the unique key field
        """
        schema_fields = {KEY_FIELD_NAME: fields.ID(unique=True, stored=True)}

        for name, field in self.type_fields:
            field_type_name = field.__class__.__name__
            if field_type_name in FIELD_MAP:
                schema_fields[name] = FIELD_MAP[field_type_name]
            elif field_type_name == ENUM_FIELD:
                # it depends on the type of enum values
                # the check must be done on a value, not on type(value): isinstance(<a type>, str)
                # is always False, which made every enum fall back to STORED
                default = field.default
                # NOTE(review): assumes the default is either an enum member (exposing `.value`)
                # or already the raw value - confirm against the Enum field implementation
                enum_value = getattr(default, "value", default)
                if isinstance(enum_value, bool):
                    # bool is checked first, as it is a subclass of int
                    schema_field = fields.BOOLEAN(stored=True)
                elif isinstance(enum_value, (str, bytes, bytearray)):
                    schema_field = fields.TEXT(stored=True)
                elif isinstance(enum_value, (int, float)):
                    schema_field = fields.NUMERIC(stored=True)
                else:
                    schema_field = fields.STORED

                schema_fields[name] = schema_field
            else:
                schema_fields[name] = fields.STORED

        return fields.Schema(**schema_fields)

    def get_index(self, schema):
        """
        open the index at index_path if one exists there, otherwise create it

        Args:
            schema (whoosh.fields.Schema): schema used to open/create the index
        """
        if exists_in(self.index_path):
            return open_dir(self.index_path, schema=schema)
        return create_in(self.index_path, schema=schema)

    def get_reader(self):
        """
        get a reader of the index
        """
        return self.index.reader()

    def get_writer(self):
        """
        get a new AsyncWriter for the index
        """
        return AsyncWriter(self.index)

    def get_searcher(self, up_to_date=True):
        """
        get a searcher of the index

        Args:
            up_to_date (bool, optional): refresh the searcher if it reports not being up to date. Defaults to True.
        """
        searcher = self.index.searcher()

        if up_to_date and not searcher.up_to_date():
            searcher = searcher.refresh()

        return searcher

    def read(self, instance_name):
        """
        read the stored document of an instance

        Args:
            instance_name (str): instance name, matched against the key field

        Raises:
            ConfigNotFound: if no document is found for this instance name

        Returns:
            dict: stored fields, secret fields are keyed with their "__" prefix
        """
        with self.get_searcher() as searcher:
            kw = {KEY_FIELD_NAME: instance_name}
            doc = searcher.document(**kw)

            if not doc:
                raise ConfigNotFound(f"cannot find config for {instance_name} in the index")

            for name, field in self.type_fields:
                # whoosh does not store None values, so, we just set them
                # if they are not set, that means when they're added, they'd the value of None
                if name not in doc and field.stored:
                    doc[name] = None

                # add __ to field name by hand
                # as we cannot add a field that starts with "__" in whoosh schema
                # the field may be absent (value not stored), renaming it then used to raise KeyError
                if field.__class__.__name__ == SECRET_FIELD and name in doc:
                    doc[f"__{name}"] = doc.pop(name)

            return doc

    def write(self, instance_name, data):
        """
        add or replace the document of an instance

        Args:
            instance_name (str): instance name, written to the key field
            data (dict): field values, it is copied and not modified
        """
        # work on a copy, so the caller's mapping is left untouched
        document = dict(data)
        document[KEY_FIELD_NAME] = instance_name

        for name, field in self.type_fields:
            if field.__class__.__name__ == SECRET_FIELD:
                name_with_prefix = f"__{name}"
                if name_with_prefix in document:
                    # remove "__", as the whoosh field cannot start with it
                    # the original field name is the one used in whoosh schema
                    document[name] = document.pop(name_with_prefix)

        # as a context manager, the writer commits on success and cancels on error,
        # so a failing update does not leave the writer (and its lock) dangling
        with self.get_writer() as writer:
            writer.update_document(**document)

    def list_all(self):
        """
        iterate over the instance names (key field values) of all documents in the index
        """
        with self.get_reader() as reader:
            for _, doc in reader.iter_docs():
                yield doc[KEY_FIELD_NAME]

    def find(self, cursor_=None, limit_=None, **queries):
        """
        search the index, every keyword argument is turned into a `field:value` query term

        Args:
            cursor_ (int, optional): page number, `default_pagenum` is used when not given
            limit_ (int, optional): page length, `default_pagelen` is used when not given
            **queries: field names mapped to the query text of each field

        Returns:
            tuple: (next page number or None on the last page, `len()` of the result, generator of hits)
        """
        # local name chosen so that the module-level whoosh `fields` import is not shadowed
        field_names = list(queries)
        query_text = " ".join(f"{name}:{value}" for name, value in queries.items())

        parser = MultifieldParser(field_names, schema=self.schema)
        parser.add_plugins(self.default_plugins)

        query = parser.parse(query_text)
        # NOTE(review): the searcher is not closed here, the hits are returned lazily
        # and presumably still need it while the caller consumes them - confirm before closing it
        searcher = self.get_searcher()

        if not cursor_:
            cursor_ = self.default_pagenum
        if not limit_:
            limit_ = self.default_pagelen

        result = searcher.search_page(query, pagenum=cursor_, pagelen=limit_)

        new_cursor = None if result.is_last_page() else result.pagenum + 1

        if result.total >= limit_:
            result = result[:limit_]

        return new_cursor, len(result), (hit for hit in result)

    def delete(self, instance_name):
        """
        delete the document of an instance

        Args:
            instance_name (str): instance name, matched against the key field
        """
        # commit on success, cancel on error (see write)
        with self.get_writer() as writer:
            writer.delete_by_term(KEY_FIELD_NAME, instance_name)

Ancestors

Instance variables

var index_path

Expand source code

@property
def index_path(self):
    """
    directory of this store's index: the base index path joined with the location name

    mkdirs is called on the directory at every access
    """
    index_dir = join_paths(self.base_index_path, self.location.name)
    mkdirs(index_dir)
    return index_dir

var type_fields

Expand source code

@property
def type_fields(self):
    """
    (name, field) pairs of the fields declared on the location type
    """
    declared_fields = self.location.type._fields
    return declared_fields.items()

Methods

def delete(self, instance_name)

Expand source code

def delete(self, instance_name):
    """
    delete the document of an instance

    Args:
        instance_name (str): instance name, matched against the key field
    """
    # used as a context manager, the writer commits on success and cancels on error,
    # so a failing delete does not leave the writer (and its lock) dangling
    with self.get_writer() as writer:
        writer.delete_by_term(KEY_FIELD_NAME, instance_name)

def get_index(self, schema)

Expand source code

def get_index(self, schema):
    """
    create the index at index_path when none exists there yet, otherwise open the existing one

    Args:
        schema (whoosh.fields.Schema): schema used to create/open the index
    """
    if not exists_in(self.index_path):
        return create_in(self.index_path, schema=schema)
    return open_dir(self.index_path, schema=schema)

def get_reader(self)

Expand source code

def get_reader(self):
    """
    get a reader of the index
    """
    index_reader = self.index.reader()
    return index_reader

def get_schema(self)

Expand source code

def get_schema(self):
    """
    build the whoosh schema from the fields of the location type

    fields found in FIELD_MAP use the mapped whoosh field, enum fields are mapped
    according to the type of their value, anything else is only stored (not indexed)

    Returns:
        whoosh.fields.Schema: the schema, including the unique key field
    """
    schema_fields = {KEY_FIELD_NAME: fields.ID(unique=True, stored=True)}

    for name, field in self.type_fields:
        field_type_name = field.__class__.__name__
        if field_type_name in FIELD_MAP:
            schema_fields[name] = FIELD_MAP[field_type_name]
        elif field_type_name == ENUM_FIELD:
            # it depends on the type of enum values
            # the check must be done on a value, not on type(value): isinstance(<a type>, str)
            # is always False, which made every enum fall back to STORED
            default = field.default
            # NOTE(review): assumes the default is either an enum member (exposing `.value`)
            # or already the raw value - confirm against the Enum field implementation
            enum_value = getattr(default, "value", default)
            if isinstance(enum_value, bool):
                # bool is checked first, as it is a subclass of int
                schema_field = fields.BOOLEAN(stored=True)
            elif isinstance(enum_value, (str, bytes, bytearray)):
                schema_field = fields.TEXT(stored=True)
            elif isinstance(enum_value, (int, float)):
                schema_field = fields.NUMERIC(stored=True)
            else:
                schema_field = fields.STORED

            schema_fields[name] = schema_field
        else:
            schema_fields[name] = fields.STORED

    return fields.Schema(**schema_fields)

def get_searcher(self, up_to_date=True)

Expand source code

def get_searcher(self, up_to_date=True):
    """
    get a searcher of the index

    Args:
        up_to_date (bool, optional): when set, a searcher reporting that it is not
            up to date is replaced by the result of its refresh(). Defaults to True.
    """
    current = self.index.searcher()

    # nothing to check when freshness was not requested
    if not up_to_date:
        return current

    if current.up_to_date():
        return current

    return current.refresh()

def get_writer(self)

Expand source code

def get_writer(self):
    """
    get a new AsyncWriter for the index
    """
    async_writer = AsyncWriter(self.index)
    return async_writer

def list_all(self)

Expand source code

def list_all(self):
    """
    iterate over the key field value of every document yielded by the index reader
    """
    with self.get_reader() as index_reader:
        yield from (stored[KEY_FIELD_NAME] for _, stored in index_reader.iter_docs())

def read(self, instance_name)

Expand source code

def read(self, instance_name):
    """
    read the stored document of an instance

    Args:
        instance_name (str): instance name, matched against the key field

    Raises:
        ConfigNotFound: if no document is found for this instance name

    Returns:
        dict: stored fields, secret fields are keyed with their "__" prefix
    """
    with self.get_searcher() as searcher:
        kw = {KEY_FIELD_NAME: instance_name}
        doc = searcher.document(**kw)

        if not doc:
            raise ConfigNotFound(f"cannot find config for {instance_name} in the index")

        for name, field in self.type_fields:
            # whoosh does not store None values, so, we just set them
            # if they are not set, that means when they're added, they'd the value of None
            if name not in doc and field.stored:
                doc[name] = None

            # add __ to field name by hand
            # as we cannot add a field that starts with "__" in whoosh schema
            # the field may be absent (value not stored), renaming it then used to raise KeyError
            if field.__class__.__name__ == SECRET_FIELD and name in doc:
                doc[f"__{name}"] = doc.pop(name)

        return doc

def write(self, instance_name, data)

Expand source code

def write(self, instance_name, data):
    """
    add or replace the document of an instance

    Args:
        instance_name (str): instance name, written to the key field
        data (dict): field values, it is copied and not modified
    """
    # work on a copy, so the caller's mapping is left untouched
    document = dict(data)
    document[KEY_FIELD_NAME] = instance_name

    for name, field in self.type_fields:
        if field.__class__.__name__ == SECRET_FIELD:
            name_with_prefix = f"__{name}"
            if name_with_prefix in document:
                # remove "__", as the whoosh field cannot start with it
                # the original field name is the one used in whoosh schema
                document[name] = document.pop(name_with_prefix)

    # used as a context manager, the writer commits on success and cancels on error,
    # so a failing update does not leave the writer (and its lock) dangling
    with self.get_writer() as writer:
        writer.update_document(**document)

Inherited members

EncryptedConfigStore:
- decrypt
- encrypt
- find
- get
- save