# examples/spouse/app.ddlog

## Random variable to predict #################################################

# This application's goal is to predict whether a given pair of person mentions
# indicates a spouse relationship or not.
# Variable relation (note the trailing `?`): one random variable per pair of
# person mentions (p1_id, p2_id).  Both columns refer to
# person_mention.mention_id and together form the key.
@extraction
has_spouse?(
    @key
    @references(relation="person_mention", column="mention_id", alias="p1")
    p1_id text,
    @key
    @references(relation="person_mention", column="mention_id", alias="p2")
    p2_id text
).

## Input Data #################################################################

# Raw input documents.  Each row is one article: `id` is its key and `content`
# is the text that gets passed to the nlp_markup function to derive sentences.
@source
articles(
    @key
    @distributed_by
    id      text,
    @searchable
    content text
).

## NLP markup #################################################################
# One row per sentence, keyed by (doc_id, sentence_index) and populated by the
# nlp_markup function.  The array-valued columns are stored as json; each
# @search_type annotation presumably names the array type the search index
# should treat that column as (TODO confirm against the indexing tool).
# NOTE: rules in this file match these columns positionally, so the column
# order must not be changed without updating every `sentences(...)` atom.
@source
sentences(
    @key
    @distributed_by
    # XXX This breaks the search index.  @source should not be derived from another @source
    #@references(relation="articles", column="id")
    doc_id         text,
    @key
    sentence_index int,
    @search_type("text[]")
    tokens         json,
    @search_type("text[]")
    lemmas         json,
    @search_type("text[]")
    pos_tags       json,
    @search_type("text[]")
    ner_tags       json,
    @search_type("int[]")
    doc_offsets    json,
    @search_type("text[]")
    dep_types      json,
    @search_type("int[]")
    dep_tokens     json
).

# UDF binding: takes (doc_id, content) rows and returns rows shaped like the
# sentences relation.  Implemented by the external script udf/nlp_markup.sh,
# which exchanges data as TSJ lines.
function nlp_markup over (
        doc_id  text,
        content text
    ) returns rows like sentences
    implementation "udf/nlp_markup.sh" handles tsj lines.

# Feed every article (id and content) through nlp_markup and append the rows
# it returns to sentences.
sentences += nlp_markup(article_id, article_text) :- articles(article_id, article_text).


## Candidate mapping ##########################################################
# Person mentions, one row per mention, keyed by mention_id.
# (doc_id, sentence_index) refer to the sentences row the mention appears in.
# begin_index/end_index presumably delimit the mention's token span in that
# sentence -- TODO confirm the exact convention in udf/map_person_mention.py.
@extraction
person_mention(
    @key
    mention_id     text,
    @searchable
    mention_text   text,
    @distributed_by
    @references(relation="sentences", column="doc_id",         alias="appears_in")
    doc_id         text,
    @references(relation="sentences", column="sentence_index", alias="appears_in")
    sentence_index int,
    begin_index    int,
    end_index      int
).

# UDF binding: takes a sentence's identity (doc_id, sentence_index) together
# with its tokens and NER tags, and returns rows shaped like person_mention.
# Implemented by udf/map_person_mention.py, which exchanges data as TSJ lines.
function map_person_mention over (
        doc_id         text,
        sentence_index int,
        tokens         text[],
        ner_tags       text[]
    ) returns rows like person_mention
    implementation "udf/map_person_mention.py" handles tsj lines.

# Run map_person_mention over every sentence.  Only the sentence identity, its
# tokens (3rd column) and its NER tags (6th column) are passed on; the other
# sentences columns are ignored.
person_mention += map_person_mention(doc, sent_idx, words, ner) :-
    sentences(
        doc, sent_idx,
        words, _, _, ner,
        _, _, _
    ).

# Candidate pairs of person mentions: for each of the two mentions, its
# mention id and its mention text (as bound by the spouse_candidate rule).
spouse_candidate(
    p1_id   text,
    p1_name text,
    p2_id   text,
    p2_name text
).

# Number of person mentions in each sentence, grouped by
# (doc_id, sentence_index).  num_people has no separate schema declaration, so
# its columns are defined by this rule's head.
num_people(doc_id, sentence_index, COUNT(p)) :-
    person_mention(p, _, doc_id, sentence_index, _, _).

# Pair up two person mentions from the same sentence as a spouse candidate
# when all of the following hold:
#   - the sentence contains fewer than 5 person mentions,
#   - the first mention id sorts before the second (each pair is emitted once),
#   - the two mention texts differ,
#   - the two mentions start at different positions.
spouse_candidate(m1, name1, m2, name2) :-
    person_mention(m1, name1, doc, sent, begin1, _),
    person_mention(m2, name2, doc, sent, begin2, _),
    num_people(doc, sent, n_mentions),
    n_mentions < 5,
    m1 < m2,
    name1 != name2,
    begin1 != begin2.


## Feature Extraction #########################################################

# Feature extraction (using DDLIB via a UDF) at the relation level
# Features for a pair of person mentions: one row per (p1_id, p2_id, feature).
# All three columns form the key; the two id columns refer to the matching
# has_spouse columns.
@extraction
spouse_feature(
    @key
    @references(relation="has_spouse", column="p1_id", alias="has_spouse")
    p1_id   text,
    @key
    @references(relation="has_spouse", column="p2_id", alias="has_spouse")
    p2_id   text,
    @key
    feature text
).

# UDF binding: takes one pair of person mentions (ids plus begin/end indexes)
# together with the NLP markup of their sentence, and returns rows shaped like
# spouse_feature.  Implemented by udf/extract_spouse_features.py, which
# exchanges data as TSJ lines.
function extract_spouse_features over (
        p1_id          text,
        p2_id          text,
        p1_begin_index int,
        p1_end_index   int,
        p2_begin_index int,
        p2_end_index   int,
        doc_id         text,
        sent_index     int,
        tokens         text[],
        lemmas         text[],
        pos_tags       text[],
        ner_tags       text[],
        dep_types      text[],
        dep_tokens     int[]
    ) returns rows like spouse_feature
    implementation "udf/extract_spouse_features.py" handles tsj lines.

# Extract features for every pair of person mentions that share a sentence,
# passing along that sentence's NLP markup (doc_offsets is skipped).
# NOTE(review): unlike the supervise rule, this body is not restricted to
# spouse_candidate, so it also covers self-pairs and both orderings of each
# pair -- confirm whether that is intended.
spouse_feature += extract_spouse_features(
    m1, m2, m1_begin, m1_end, m2_begin, m2_end,
    doc, sent, toks, lems, pos, ner, dep_kinds, dep_toks
) :-
    sentences(doc, sent, toks, lems, pos, ner, _, dep_kinds, dep_toks),
    person_mention(m1, _, doc, sent, m1_begin, m1_end),
    person_mention(m2, _, doc, sent, m2_begin, m2_end).
## Distant Supervision ########################################################

# Supervision labels for pairs of person mentions.  Each row carries an
# integer label for (p1_id, p2_id) and the rule_id of the rule that produced
# it (NULL for the default unsupervised rows).  A pair may receive labels from
# several rules; they are combined later in spouse_label_resolved.
@extraction
spouse_label(
    @key
    @references(relation="has_spouse", column="p1_id", alias="has_spouse")
    p1_id   text,
    @key
    @references(relation="has_spouse", column="p2_id", alias="has_spouse")
    p2_id   text,
    @navigable
    label   int,
    @navigable
    rule_id text
).

# make sure all pairs in spouse_candidate are considered as unsupervised examples
# Every candidate pair gets a neutral label (0) with no rule id, so pairs that
# no supervision rule touches still appear in spouse_label.
spouse_label(m1, m2, 0, NULL) :-
    spouse_candidate(m1, _, m2, _).

# distant supervision using data from DBpedia

# Known spouse pairs loaded from DBpedia, stored as pairs of person names and
# used for distant supervision.  Both name columns together form the key.
@source
spouses_dbpedia(
    @key
    person1_name text,
    @key
    person2_name text
).

# Positive label (1, rule id "from_dbpedia") for a candidate pair whose two
# mention texts match a DBpedia spouse pair, ignoring letter case and in
# either order.
spouse_label(m1, m2, 1, "from_dbpedia") :-
    spouses_dbpedia(kb_a, kb_b),
    spouse_candidate(m1, name1, m2, name2),
    [ lower(kb_a) = lower(name1), lower(kb_b) = lower(name2) ;
      lower(kb_b) = lower(name1), lower(kb_a) = lower(name2) ].


# supervision by heuristic rules in a UDF
# Heuristic supervision UDF (udf/supervise_spouse.py, exchanging TSJ lines).
# Input: one row per candidate pair -- both mentions' ids and begin/end
# indexes, plus the identity and NLP markup of their sentence.  The column
# list mirrors, in order, the 14 arguments passed where
# `spouse_label += supervise(...)` is invoked; the sentences relation has no
# sentence_text column, so none is declared here.
# Output: (p1_id, p2_id, label, rule_id) rows that are added to spouse_label.
# NOTE(review): the UDF script is not visible from this file -- confirm it
# reads exactly these 14 columns in this order.
function supervise over (
        p1_id text, p1_begin int, p1_end int,
        p2_id text, p2_begin int, p2_end int,
        doc_id         text,
        sentence_index int,
        tokens         text[],
        lemmas         text[],
        pos_tags       text[],
        ner_tags       text[],
        dep_types      text[],
        dep_tokens     int[]
    ) returns (
        p1_id text, p2_id text, label int, rule_id text
    )
    implementation "udf/supervise_spouse.py" handles tsj lines.

# Run the supervise UDF on every candidate pair.  The sentence is looked up
# through the first mention's (doc_id, sentence_index); the second mention is
# looked up by its id alone.  Mention texts and doc_offsets are not passed on.
spouse_label += supervise(
    m1, m1_begin, m1_end,
    m2, m2_begin, m2_end,
    doc, sent,
    toks, lems, pos, ner, dep_kinds, dep_toks
) :-
    spouse_candidate(m1, _, m2, _),
    person_mention(m1, _, doc, sent, m1_begin, m1_end),
    person_mention(m2, _, _, _, m2_begin, m2_end),
    sentences(doc, sent, toks, lems, pos, ner, _, dep_kinds, dep_toks).


# resolve multiple labels by majority vote (summing the labels in {-1,0,1})
# Sum all label votes per (p1_id, p2_id); which rule cast each vote is
# irrelevant here.
spouse_label_resolved(p1_id, p2_id, SUM(vote)) :-
    spouse_label(p1_id, p2_id, vote, _).

# assign the resolved labels for the spouse relation
# Turn the summed votes into the has_spouse label: positive sum -> TRUE,
# negative sum -> FALSE, zero -> NULL (no label).
@materialize
has_spouse(p1_id, p2_id) =
    if vote_sum > 0 then TRUE
    else if vote_sum < 0 then FALSE
    else NULL end
    :- spouse_label_resolved(p1_id, p2_id, vote_sum).

## Inference Rules ############################################################

# Features
# One weight per distinct feature value: each spouse_feature row connects its
# feature's weight to the has_spouse variable of the same mention pair.
@weight(feat)
has_spouse(m1, m2) :- spouse_feature(m1, m2, feat).

# Inference rule: Symmetry
# Fixed weight 3.0 on the implication has_spouse(x, y) => has_spouse(y, x).
@weight(3.0)
has_spouse(x_id, y_id) => has_spouse(y_id, x_id) :- TRUE.

# Inference rule: Only one marriage
# Fixed weight -1.0 on the implication has_spouse(x, y) => has_spouse(x, z),
# i.e. a negative weight between two has_spouse variables that share their
# first mention.
# NOTE(review): nothing in this rule excludes y == z -- confirm that
# self-implications are harmless or intended.
@weight(-1.0)
has_spouse(x_id, y_id) => has_spouse(x_id, z_id) :- TRUE.