@prefix ex:<https://purl.expasy.org/sparql-examples/neXtProt/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <https://schema.org/> .
@prefix sh: <http://www.w3.org/ns/shacl#> .

# neXtProt example query NXQ_00126, packaged as a SHACL SPARQL SELECT executable.
# sh:select holds the query text; schema:target names the endpoint it is written for.
ex:NXQ_00126 a sh:SPARQLExecutable,
        sh:SPARQLSelectExecutable ;
    rdfs:comment "Peptides that are potential neo N-termini from undescribed isoforms"@en ;
    sh:prefixes _:sparql_examples_prefixes ;
    sh:select """PREFIX : <http://nextprot.org/rdf/>

SELECT DISTINCT ?entry ?pep WHERE {
 ?entry :isoform ?iso .
 ?iso :peptideMapping ?pm .
 ?pm :start ?p1 ; :end ?p2 .
 ?pm :proteotypic true .
 filter(?p1 > 2) . # must not be already N-terminal
 ?iso :sequence / :chain ?chain .
 # substr() is 1-based: ?pep is the mapped peptide, the other three are single residues
 bind (substr(?chain, ?p1, ?p2 - ?p1 + 1) as ?pep) .
 bind (substr(?chain, ?p1-1, 1) as ?prevAA) .
 bind (substr(?chain, ?p1, 1) as ?firstAA) .
 bind (substr(?chain, ?p2, 1) as ?lastAA) .
 filter(!regex (?prevAA,'[KR]')) # must be semi-tryptic in N-ter
 filter(regex (?prevAA,'M') || regex (?firstAA,'M')) # must be N-terminal: preceded by, or starting with, M
 filter(!regex (?firstAA,'[DEFIKLRY]')) # plausible 2nd AA
 filter(regex (?lastAA,'[KR]')) # must be tryptic in C-ter
 filter not exists { # The candidate peptide must not already exist as N-ter in a described isoform
  ?entry :isoform ?iso2 .
  ?iso2 :sequence / :chain ?chain2 .
  ?iso2 :matureProtein [ :start ?mstart ; :end ?mend ] .
  bind (substr(?chain2, ?mstart, ?mend - ?mstart + 1) as ?mat2) .
  filter(strlen(?mat2) > 30) . # only mature chains longer than 30 residues are compared
  # ?mat22 is ?mat2 without its first residue (presumably to cover a removed initiator Met)
  bind (substr(?mat2, 2, strlen(?mat2) - 1) as ?mat22) .
  # literal prefix test; avoids building a regex from ?pep for every candidate
  filter(strstarts(?mat2, ?pep) || strstarts(?mat22, ?pep)) .
 }
}
order by ?pep

# overestimated, need additional filter(s)
# but for instance AELEEVTLDGKPLQALR and AELEEVTLDGKPLQALRVTDLKAALEQR in Q9UKV3
# are N-acetylated and good markers for an additional iso starting at M-59""" ;
    schema:keywords "isoforms",
        "peptide",
        "proteomics",
        "tutorial" ;
    schema:target <https://sparql.nextprot.org/sparql> .

