Code
--[[
Pucker Picker
Sometimes experiments end up with "missing residues", which are usually amino acids
that can't be located in the electron density cloud. (There's a chance that the same
thing could happen with RNA or DNA bases, but this has not been seen in Foldit.)
Usually, Foldit puzzles handle missing residues correctly. In effect, a gap terminates
one chain and starts a new chain.
Sometimes, a Foldit puzzle connects the segments on either side of the gap. This creates
a "pucker", a distortion in the structure of the protein. The two segments involved in the
pucker will have large negative ideality subscores, and their alpha carbons will be
further apart than normal. (Or the phosphorus atoms for bases.)
Pucker Picker peruses for puckers and produces a prompt report of possible puckered places.
Pucker Picker picks the segments on either side of a pucker. (Otherwise known as a selection.)
Pucker Picker should be run from the starting pose of a puzzle. Normal Foldit handling
tends to eliminate the worst effects of a pucker. Eventually the sides of the pucker
may appear to be one chain again, based on distances.
Sadly, there's no way to truly correct a true pucker. The entire protein will end up being
somewhat distorted, calling the scientific results of the puzzle into question.
Pucker Picker isn't perfect. It may sometimes detect puckers that aren't there.
For protein, the distance between alpha carbons is typically 4 Angstroms or less.
For proteins, the ends of two protein chains may sometimes be relatively close together,
without being bonded.
Protein can also be pulled into an unnatural configuration in Foldit. It's possible to get
a distance of over 50 Angstroms. Segments with that type of spacing will have very bad
ideality, so Pucker Picker will catch the long "straws" that sometimes appear.
Pucker Picker uses custom minimum and maximum distances:
* for protein a distance between 4.176 and 5.5 is considered a possible pucker
* for DNA, a distance between 8 and 14.89 is considered a possible pucker
Anything less than the minimum distance is considered a normal chain. Anything greater than or
equal to the maximum distance is considered to be two separate chains.
See https://foldit.fandom.com/wiki/Pucker on the Foldit wiki for more.
version 3.0 - LociOiling - 20250524 RC1
* cloned from Find the Chains 2.0
* starting at version 3.0 to match "print protein" and related recipes
]]--
Recipe = "Pucker Picker"
Version = "3.0 RC1"
ReVersion = Recipe .. " " .. Version
protNfo = { -- protNfo--protNfo--protNfo--protNfo--protNfo--protNfo--protNfo
--[[
protNfo package version 0.7
protNfo is packaged as a psuedo-class or psuedo-module
containing a mix of data fields and functions
all entries must be terminated with a comma to keep Lua happy
the commas aren't necessary if only function definitions are present
versions
--------
0.3 - add chain detection from AA Edit 2.0
0.4 - add ligand detection from GetSeCount
0.4 - merges in the ligand logic from GetSeCount
0.5 - still a work in progress
0.6 - integrate AminoAcids table
0.7 - trim the info collected, remove atom count logic
]]--
--
-- AminoAcids
--
-- names and key properties of all known amino acids and nucleobases
--
-- Notes:
--
-- * commented entries (at the end) are not in Foldit
-- * one-letter amino acid code is the table key
-- * two-letter RNA and DNA nucleotides are also valid
-- * the fields in this table are now referenced by name
-- * the "unk" and "x" codes are considered protein, unless the segment is marked as
-- ligand in the secondary structure ( code "M" )
-- * acref is atom count mid-chain, used to detect multiple peptide chains
--
AminoAcids = {
a = { code = "a", ctype = "P", acref = 10, short = "Ala", long = "Alanine", hydrop = 1.8 },
c = { code = "c", ctype = "P", acref = 11, short = "Cys", long = "Cysteine", hydrop = 2.5 },
d = { code = "d", ctype = "P", acref = 12, short = "Asp", long = "Aspartate", hydrop = -3.5 },
e = { code = "e", ctype = "P", acref = 15, short = "Glu", long = "Glutamate", hydrop = -3.5 },
f = { code = "f", ctype = "P", acref = 20, short = "Phe", long = "Phenylalanine", hydrop = 2.8 },
g = { code = "g", ctype = "P", acref = 7, short = "Gly", long = "Glycine", hydrop = -0.4 },
h = { code = "h", ctype = "P", acref = 17, short = "His", long = "Histidine", hydrop = -3.2 },
i = { code = "i", ctype = "P", acref = 19, short = "Ile", long = "Isoleucine", hydrop = 4.5 },
k = { code = "k", ctype = "P", acref = 22, short = "Lys", long = "Lysine", hydrop = -3.9 },
l = { code = "l", ctype = "P", acref = 19, short = "Leu", long = "Leucine", hydrop = 3.8 },
m = { code = "m", ctype = "P", acref = 17, short = "Met", long = "Methionine ", hydrop = 1.9 },
n = { code = "n", ctype = "P", acref = 14, short = "Asn", long = "Asparagine", hydrop = -3.5 },
p = { code = "p", ctype = "P", acref = 15, short = "Pro", long = "Proline", hydrop = -1.6 },
q = { code = "q", ctype = "P", acref = 17, short = "Gln", long = "Glutamine", hydrop = -3.5 },
r = { code = "r", ctype = "P", acref = 24, short = "Arg", long = "Arginine", hydrop = -4.5 },
s = { code = "s", ctype = "P", acref = 11, short = "Ser", long = "Serine", hydrop = -0.8 },
t = { code = "t", ctype = "P", acref = 14, short = "Thr", long = "Threonine", hydrop = -0.7 },
v = { code = "v", ctype = "P", acref = 16, short = "Val", long = "Valine", hydrop = 4.2 },
w = { code = "w", ctype = "P", acref = 24, short = "Trp", long = "Tryptophan", hydrop = -0.9 },
y = { code = "y", ctype = "P", acref = 21, short = "Tyr", long = "Tyrosine", hydrop = -1.3 },
--
-- codes for ligands or modified amino acids
--
x = { code = "x", ctype = "P", acref = 0, short = "Xaa", long = "Unknown", hydrop = 0 },
unk = { code = "x", ctype = "P", acref = 0, short = "Xaa", long = "Unknown", hydrop = 0 },
--
-- bonus! RNA nucleotides
--
ra = { code = "a", ctype = "R", acref = 33, short = "a", long = "Adenine", hydrop = 0, },
rc = { code = "c", ctype = "R", acref = 31, short = "c", long = "Cytosine", hydrop = 0, },
rg = { code = "g", ctype = "R", acref = 34, short = "g", long = "Guanine", hydrop = 0, },
ru = { code = "u", ctype = "R", acref = 30, short = "u", long = "Uracil", hydrop = 0, },
--
-- bonus! DNA nucleotides
--
da = { code = "a", ctype = "D", acref = 0, short = "a", long = "Adenine", hydrop = 0, },
dc = { code = "c", ctype = "D", acref = 0, short = "c", long = "Cytosine", hydrop = 0, },
dg = { code = "g", ctype = "D", acref = 0, short = "g", long = "Guanine", hydrop = 0, },
dt = { code = "t", ctype = "D", acref = 0, short = "t", long = "Thymine", hydrop = 0, },
--
-- dusty attic! musty cellar! jumbled boxroom!
-- can't bear to part with these treasures
--
-- b = { code = "b", ctype = "P", acref = 10, short = "Asx", long = "Asparagine/Aspartic acid", hydrop = 0 },
-- j = { code = "j", ctype = "P", acref = 10, short = "Xle", long = "Leucine/Isoleucine", hydrop = 0 },
-- o = { code = "o", ctype = "P", acref = 10, short = "Pyl", long = "Pyrrolysine", hydrop = 0 },
-- u = { code = "u", ctype = "P", acref = 10, short = "Sec", long = "Selenocysteine", hydrop = 0 },
-- z = { code = "z", ctype = "P", acref = 10, short = "Glx", long = "Glutamine or glutamic acid", hydrop = 0 } ,
},
aalist = {}, -- list of AA codes
rnalist = {}, -- list of RNA codes
dnalist = {}, -- list DNA codes
Ctypes = {
P = "protein",
D = "DNA",
R = "RNA",
M = "ligand",
},
PROTEIN = "P",
LIGAND = "M",
RNA = "R",
DNA = "D",
UNKNOWN_AA = "x",
UNKNOWN_BASE = "xx",
HELIX = "H",
SHEET = "E",
LOOP = "E",
segCnt = 0, -- unadjusted segment count
segCnt2 = 0, -- segment count adjusted for terminal ligands
aa = {}, -- amino acid codes
ss = {}, -- secondary structure codes
ACRF = 4.0, -- alpha carbon reference distance (protein)
PRF = 8.00, -- phosphorus reference distance (RNA/DNA)
acdx = {}, -- alpha carbon distance
ctype = {}, -- segment type - P, M, R, D
first = {}, -- true if segment is first in chain
last = {}, -- true if segment is last in chain
fastac = {}, -- external code for FASTA-style output
short = {}, -- short name
long = {}, -- long name
chainid = {}, -- chain id
chainpos = {}, -- position in chain
chains = {}, -- summary of chains
ligands = {}, -- ligand table
DEBUG = false,
round = function ( ii )
return ii - ii % 0.001
end,
--
-- get a chain id - works for A through ZZ, after that returns "??"
--
getchid = function ( ndx )
local chainid = { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
"N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" }
local chmod = ( ndx - 1 ) % #chainid
local chid = chainid [ chmod + 1 ]
local chquo = math.floor ( ( ndx - 1 ) / #chainid )
if chquo > 0 then
if chquo + 1 <= #chainid then
chid = chainid [ chquo + 1 ] .. chid
else
chid = "!!"
end
end
return chid
end,
getChains = function ( self )
--
-- getChains - build a table of the chains found
--
-- Most Foldit puzzles contain only a single protein (peptide) chain.
-- A few puzzles contain ligands, and some puzzles have had two
-- protein chains. Foldit puzzles may also contain RNA or DNA.
--
-- For proteins, the atom count can be used to identify the first
-- (N terminal) and last (C terminal) ends of the chain. The AminoAcids
-- table has the mid-chain atom counts for each amino acid.
--
-- Cysteine is a special case, since the presence of a disulfide
-- bridge also changes the atom count.
--
-- For DNA and RNA, the beginning and end of the chain is determined
-- by context at present. For example, if the previous segment was protein
-- and this segment is DNA, it's the start of a chain.
--
-- Each ligand is treated as a chain of its own, with a length of 1.
--
-- chain table entries
-- -------------------
--
-- ctype - chain type - "P" for protein, "M" for ligand, "R" for RNA, "D" for DNA
-- fasta - FASTA-format sequence, single-letter codes (does not include FASTA header)
-- start - Foldit segment number of sequence start
-- stop - Foldit segment number of sequence end
-- len - length of sequence
-- chainid - chain id assigned to entry, "A", "B", "C", and so on
--
-- For DNA and RNA, fasta contains single-letter codes, so "a" for adenine.
-- The codes overlap the amino acid codes (for example, "a" for alanine).
-- The DNA and RNA codes must be converted to the appropriate two-letter codes Foldit
-- uses internally, for example "ra" for RNA adenine and "da" for DNA adenine.
--
local chainz = {}
local chindx = 0
local curchn = nil
for ii = 1, self.segCnt do
if self.first [ ii ] then
chindx = chindx + 1
chainz [ chindx ] = {}
curchn = chainz [ chindx ]
curchn.ctype = self.ctype [ ii ]
curchn.fasta = ""
curchn.ss = ""
curchn.start = ii
curchn.stop = ii
curchn.chainid = protNfo.getchid ( chindx )
curchn.len = 0
end
curchn.fasta = curchn.fasta .. self.fastac [ ii ]
curchn.ss = curchn.ss .. self.ss [ ii ]
self.chainid [ #self.chainid + 1 ] = curchn.chainid
self.chainpos [ #self.chainpos + 1 ] = ii - curchn.start + 1
if self.last [ ii ] then
curchn.stop = ii
curchn.len = curchn.stop - ( curchn.start - 1 )
end
end
return chainz
end,
getLigands = function ( self )
--
-- ultra-paranoid method for detecting ligands
--
-- each ligand segment is treated separately in this version
--
local ligandz = {}
for ii = 1, self.segCnt do
if self.ss [ ii ] == "M" then
local atoms = structure.GetAtomCount ( ii )
local rots = rotamer.GetCount ( ii )
local sscor = current.GetSegmentEnergyScore ( ii )
ligandz [ #ligandz + 1 ] = { seg = ii, atoms = atoms, rots = rots, score = sscor }
end
end
if self.DEBUG then
print ( #ligandz .. " ligands" )
for jj = 1, #ligandz do
print ( "ligand # "
.. jj ..
", segment = "
.. ligandz [ jj ].seg ..
", atoms = "
.. ligandz [ jj ].atoms ..
", rotamers = "
.. ligandz [ jj ].rots ..
", score = "
.. self.round ( ligandz [ jj ].score )
)
if ligandz [ jj ].seg < self.segCnt2 then
print ( "WARNING: non-standard ligand at segment "
.. ligandz [ jj ].seg ..
", most ligand-aware recipes won't work properly" )
end
end
end
return ligandz
end,
setNfo = function ( self )
self.segCnt = structure.GetCount()
--
-- standard ligand adjustment
--
self.segCnt2 = self.segCnt
while self.ss [ self.segCnt2 ] == "M" do
self.segCnt2 = self.segCnt2 - 1
end
if self.segCnt2 == self.segCnt then
print ( "segment count = " .. self.segCnt )
else
print ( "original segment count = " .. self.segCnt )
print ( "adjusted segment count = " .. self.segCnt2 )
end
--
-- partition AminoAcids for display purposes
--
for key, value in pairs ( protNfo.AminoAcids ) do
if value.ctype == self.PROTEIN then
self.aalist [ #self.aalist + 1 ] = key
elseif value.ctype == self.RNA then
self.rnalist [ #self.rnalist + 1 ] = key
elseif value.ctype == self.DNA then
self.dnalist [ #self.dnalist + 1 ] = key
end
end
--
-- initial scan - retrieve basic info from Foldit and AminoAcids table
--
for ii = 1, self.segCnt do
self.aa [ #self.aa + 1 ] = structure.GetAminoAcid ( ii )
self.ss [ #self.ss + 1 ] = structure.GetSecondaryStructure ( ii )
--
-- look it up
--
local aatab = self.AminoAcids [ self.aa [ ii ] ]
if aatab ~= nil then
self.ctype [ #self.ctype + 1 ] = aatab.ctype
--
-- even the codes 'x' or 'unk' are considered protein
-- unless the secondary structure is "M"
--
-- this handles glycosylated amino acids
-- in puzzles 879, 1378b, and similar
--
-- segment 134 in puzzle 879 is the example,
-- it's no longer asparagine, but it is part of
-- the peptide chain
--
if self.ss [ ii ] == self.LIGAND then
self.ctype [ ii ] = self.LIGAND
end
--
-- other info
--
else
--
-- special case: unknown code - mark it as ligand
--
-- this should not occur, but just in case
--
self.ctype [ #self.ctype + 1 ] = self.LIGAND
aa = self.UNKNOWN_AA -- a known unknown
aatab = self.AminoAcids [ aa ]
end
--
-- get distance
--
if ii < self.segCnt then
protNfo.acdx [ #protNfo.acdx + 1 ] = structure.GetDistance ( ii, ii + 1 )
else
protNfo.acdx [ #protNfo.acdx + 1 ] = 10000
end
--
-- save values from amino acids table
--
self.short [ #self.short + 1 ] = aatab.short
self.long [ #self.long + 1 ] = aatab.long
self.fastac [ #self.fastac + 1 ] = aatab.code
self.first [ #self.first + 1 ] = false
self.last [ #self.last + 1 ] = false
end -- end of initial scan
--
-- to determine first and last in chain for all types,
-- based on change in type (control break)
--
for ii = 1, self.segCnt do
if ii == 1 then
self.first [ ii ] = true
elseif ii == self.segCnt then
self.last [ ii ] = true
else
if self.ctype [ ii ] ~= self.ctype [ ii - 1 ] then
self.first [ ii ] = true
end
if self.ctype [ ii ] ~= self.ctype [ ii + 1 ] then
self.last [ ii ] = true
end
end
if self.ctype [ ii ] == self.LIGAND then
self.first [ ii ] = true
self.last [ ii ] = true
end
if self.first [ ii ] and self.DEBUG then
print ( "chain start at segment " .. ii .. ", type = " .. self.Ctypes [ self.ctype [ ii ] ] )
end
if self.last [ ii ] and self.DEBUG then
print ( "chain end at segment " .. ii .. ", type = " .. self.Ctypes [ self.ctype [ ii ] ] )
end
end
--
-- look for chain breaks based on distances
--
for ii = 1, self.segCnt do
local stype = self.ctype [ ii ] -- type of this segment
local gref = 0 -- gap reference distance
if stype == self.PROTEIN then
gref = self.ACRF
elseif stype == self.DNA then
gref = self.PRF
elseif stype == self.RNA then
gref = self.PRF
end
--
-- up until last segment
--
if ii < self.segCnt then
if self.ctype [ ii + 1 ] == stype then
if self.acdx [ ii ] > gref then
self.last [ ii ] = true
if self.DEBUG then
print ( "chain end at " .. ii .. " due to gap" )
end
end
end
end
--
-- after first segment
--
if ii > 1 then
if self.ctype [ ii - 1 ] == stype then
if self.acdx [ ii - 1 ] > gref then
self.first [ ii ] = true
if self.DEBUG then
print ( "chain start at " .. ii .. " due to gap" )
end
end
end
end
end
--
-- summarize the chain info
--
self.chains = self:getChains ()
--
-- get the ligand info
--
self.ligands = self:getLigands ()
end,
} -- protNfo--protNfo--protNfo--protNfo--protNfo--protNfo--protNfo
function main ()
print ( ReVersion )
print ( "--" )
-- protNfo.DEBUG = true
protNfo:setNfo ()
local chainz = protNfo.chains
print ( "--" )
local puckerz = {}
--
-- list all chains
--
local chw = " chain"
if #chainz > 1 then
chw = chw .. "s"
end
print ( #chainz .. chw )
for cc = 1, #chainz do
print ( "chain "
.. chainz [ cc ].chainid ..
" ("
.. protNfo.Ctypes [ chainz [ cc ].ctype ] ..
"), start = "
.. chainz [ cc ].start ..
", end = "
.. chainz [ cc ].stop ..
", length = "
.. chainz [ cc ].len
)
--
-- peek for a pucker post this chain and pre the next
--
if cc < #chainz then
--
-- chains must be the same type
--
if chainz [ cc ].ctype == chainz [ cc + 1 ].ctype then
local distmin = 0
local distmax = 0
if chainz [ cc ].ctype == protNfo.PROTEIN then
distmin = protNfo.ACRF + 0.176 -- empirical fudge factor
distmax = protNfo.ACRF + 1.5 -- empirical fudge factor
else
distmin = protNfo.PRF
distmax = 14.89 -- empirical fudge factor
end
if distmin ~= 0 then -- don't worry about ligands or whatever
local pstart = chainz [ cc ].stop
local pstop = chainz [ cc + 1 ].start
local pdist = protNfo.acdx [ pstart ]
local ptype = protNfo.Ctypes [ chainz [ cc ].ctype ]
local pmode = ""
local istart = current.GetSegmentEnergySubscore ( pstart, "Ideality" )
local istop = current.GetSegmentEnergySubscore ( pstop, "Ideality" )
if pdist > distmin and pdist <= distmax then
pmode = "distance"
end
if istart < -500 and istop < -500 then -- empirical threshold
if pmode:len () == 0 then
pmode = "ideality"
else
pmode = pmode .. "\/ideality"
end
end
if pmode:len () > 0 then
puckerz [ #puckerz + 1 ] = { mode = pmode, start = pstart, stop = pstop, istart = istart, istop = istop, dist = pdist, type = ptype, }
end
end
end
end
end
print ( "--" )
print ( ReVersion )
local plines = {}
if #puckerz > 0 then
selection.DeselectAll ()
local ptemp = puzzle.GetName ()
print ( ptemp )
plines [ #plines + 1 ] = ptemp
local pword = "pucker"
if #puckerz > 1 then
pword = "puckers"
end
ptemp = #puckerz .. " " .. pword .. " found!"
print ( ptemp )
plines [ #plines + 1 ] = ptemp
for ii = 1, #puckerz do
--
-- long version for scriptlog
--
local ptemp =
"pucker " .. ii .. " (" .. puckerz [ ii ].mode .. "), segments " .. puckerz [ ii ].start .. "-" .. puckerz [ ii ].stop ..
" (" .. puckerz [ ii ].type .. ")" ..
", distance = " .. protNfo.round ( puckerz [ ii ].dist ) ..
", ideality = " .. protNfo.round ( puckerz [ ii ].istart ) .. ", " .. protNfo.round (puckerz [ ii ].istop )
print ( ptemp )
--
-- short version for screen
--
local ptemp =
"pucker " .. ii .. " (" .. puckerz [ ii ].mode .. "), at " .. puckerz [ ii ].start .. "-" .. puckerz [ ii ].stop ..
" (" .. puckerz [ ii ].type .. ")"
plines [ #plines + 1 ] = ptemp
--
selection.SelectRange ( puckerz [ ii ].start, puckerz [ ii ].stop ) -- not much of a range, but...
end
else
local ptemp = puzzle.GetName ()
print ( ptemp )
plines [ #plines + 1 ] = ptemp
ptemp = "No puckers found!"
print ( ptemp )
plines [ #plines + 1 ] = ptemp
end
local d = dialog.CreateDialog ( ReVersion .. " - pucker report" )
d.l0 = dialog.AddLabel ( "" )
for ii = 1, #plines do
d [ "line" .. ii ] = dialog.AddLabel ( plines [ ii ] )
end
d.l1 = dialog.AddLabel ( "" )
d.l2 = dialog.AddLabel ( "See scriptlog for additional details" )
d.OK = dialog.AddButton ( "OK", 1 )
dialog.Show ( d )
cleanup ()
end
function cleanup ( errmsg ) -- cleanup v0.3
if CLEANUPENTRY ~= nil then
return
end
CLEANUPENTRY = true
print ( "---" )
local reason
local start, stop, line, msg
if errmsg == nil then
reason = "complete"
else
start, stop, line, msg = errmsg:find ( ":(%d+):%s()" )
if msg ~= nil then
errmsg = errmsg:sub ( msg, #errmsg )
end
if errmsg:find ( "Cancelled" ) ~= nil then
reason = "cancelled"
else
reason = "error"
end
end
print ( ReVersion .. " " .. reason )
if reason == "error" then
print ( "Unexpected error detected" )
print ( "Error line: " .. line )
print ( "Error: \"" .. errmsg .. "\"" )
end
end
xpcall ( main, cleanup )