Code
--[[
PDBReader - convert PDB format
Scans, filters, and converts PDB records to
determine secondary structure and disulfide
bridges.
The relevant PDB record types are DBREF,
SEQRES, HELIX, SHEET, and SSBOND. These records
are found in the header section of a PDB file.
The recipe ignores all other PDB record types.
See the comments for PDBReader below for details.
version 0.9 -- 2019/06/09 -- LociOiling
* new recipe, cloned from NetSurfP
]]--
--
-- Globals
--
Recipe = "PDBReader"
Version = "0.9"
ReVersion = Recipe .. " v." .. Version
--
-- end of globals section
--
--
-- begin PDBreader 0.9
--
-- PDBReader - read, filter, and convert PDB entry
--
-- Reads the SEQRES, HELIX, SHEET, and SSBOND entries from a PDB file.
--
-- Filters the entries based on the specified chain id
--
-- Converts the selected entries into formats useful in Foldit.
--
-- 1 2 3 4 5 6 7 8
-- 12345678901234567890123456789012345678901234567890123456789012345678901234567890
-- arguments:
-- pdbentry - PDB file (can include entire PDB file if desired)
-- chain - chain id, normally "A", "B", "C", and so on
--
-- returns:
-- structs - table containing helix and sheet info
-- aastring - amino acids as single-character codes, suitable for AA Edit
-- sstring - secondary structure as a string, suitable for SS Edit
-- disulfides - table containing residue pairs for disulfides
--
-- The user must determine the offset between the PDB residue numbers and the Foldit
-- segment numbers in order to apply the secondary structure and disulfide info.
--
-- The user must also locate the PDB entry and copy the PDB file.
-- Foldit recipes can't directly access the online PDB.
--
-- The Foldit recipe AA Edit returns the primary structure of a protein as a
-- string of single-character amino acids codes. This format can be used for
-- the PDB at http://rcsb.org. Many other online tools also use this format.
--
-- 1 2 3 4 5 6 7 8
-- 12345678901234567890123456789012345678901234567890123456789012345678901234567890
-- Jpred, found at http://www.compbio.dundee.ac.uk/jpred/index.html, matches a
-- primary structure to PDB entries and report on the alignment. The primary
-- structure string reported by AA Edit or print protein can be input to Jpred.
-- Jpred reports matching PDB entries. Jpred reports alignment below this list
-- under "Alignment of PDB hits to your sequence". You must click on "Click to
-- show/hide details" to see the exact details of each match. The input is shown
-- as "Query", and the matching PDB entry is shown as "Sbjct". The offset is
-- the difference between the Subject and the Query.
--
-- For example, given the amino acid sequence:
--
-- vtlfvalydyearteddlsfhkgekfqilnssegdwwearslttgetgyipsnyvapvd
--
-- Jpred reports several PDB matches, with 4EIK as the first match. The
-- alignment details for 4EIK are as follows:
--
-- >4eik_A mol:protein length:64 Tyrosine-protein kinase Fyn
-- Length = 64
-- Score = 126 bits (316), Expect = 3e-29
-- Identities = 59/59 (100%), Positives = 59/59 (100%)
--
-- Query: 1 VTLFVALYDYEARTEDDLSFHKGEKFQILNSSEGDWWEARSLTTGETGYIPSNYVAPVD 59
-- VTLFVALYDYEARTEDDLSFHKGEKFQILNSSEGDWWEARSLTTGETGYIPSNYVAPVD
-- Sbjct: 5 VTLFVALYDYEARTEDDLSFHKGEKFQILNSSEGDWWEARSLTTGETGYIPSNYVAPVD 63
--
-- This means 4EIK matches the input sequence exactly. The "4eik_A", means the
-- chain ID is "A". Residue 5 from 4EIK (the "Sbjct") matches segment 1 in
-- Foldit (the "Query"), giving an offset of -4. The starting residue for
-- PDBReader is 5, and the ending residue is 63.
--
-- Jpred matching using the sequence found in the FASTA-format file associated
-- with the PDB entry. This numbering doesn't always agree with numbering in
-- actual PDB model. Inspecting the PDB file to determine the correct values
-- may be necessary to straighten out these cases.
--
-- In some cases, the PDB model may even skip residue numbers. The current
-- version of PDBReader does not address this scenario.
--
-- To obtain the input for PDBReader, go to the PDB page for your match,
-- for example:
--
-- https://www.rcsb.org/structure/4eik
--
-- Click on "Display Files" and select "PDB Format (Header)". The HELIX,
-- SHEET, and SSBOND lines are found toward the end of the header. You
-- can copy these lines and paste them into the "pdbentry" argument to
-- PDBReader. Or, you can simply copy and paste the entire header.
-- PDBReader filters out other record types.
--
--
-- indices for secondary structure table returned by PDBReader
--
SSTYPE = 1 -- type - "H" or "E"
SSSEQ1 = 2 -- segment/residue # 1
SSACD1 = 3 -- amino acid code residue #1
SSSEQ2 = 4 -- segment/residue # 1
SSACD2 = 5 -- amino acid code residue #2
function PDBReader ( pdbentry, chain )
--
-- convert three-letter abbreviations to single-letter lowercase codes
--
AACodes = {
TRP = "w",
GLY = "g",
ALA = "a",
CYS = "c",
VAL = "v",
LEU = "l",
ILE = "i",
MET = "m",
MSE = "m", -- for 1WDV and the like (SELENOMETHIONINE)
PRO = "p",
PHE = "f",
TYR = "y",
ARG = "r",
SER = "s",
THR = "t",
ASN = "n",
ASP = "d",
BFD = "d", -- for 2FTK and the like (ASPARTATE BERYLLIUM TRIFLUORIDE)
GLN = "q",
GLU = "e",
HIS = "h",
LYS = "k",
}
local function AACode ( abbrev )
local res = AACodes [ abbrev ]
if res == nil then
res = "x"
end
return res
end
local TYPENUM = 1
local TYPECHAR = 2
local STRUINDX = 1
local STRUBEG = 2
local STRUEND = 3
local STRUTYPE = 4
--
-- array indexes below identify the table output
-- produced by PDBReader for each type of PDB record
--
--
-- indexes for HELIX record
--
local HENUM = 1
local HEID = 2
local HEIRES = 3
local HEICHN = 4
local HEISEQ = 5
local HEICOD = 6
local HETRES = 7
local HETCHN = 8
local HETSEQ = 9
local HETCOD = 10
local HETYPE = 11
local HECOMM = 12
local HELENG = 13
local helixPOS = {
{ HENUM, 8, 10, TYPENUM, },
{ HEID, 12, 14, TYPECHAR, },
{ HEIRES, 16, 18, TYPECHAR, },
{ HEICHN, 20, 20, TYPECHAR, },
{ HEISEQ, 22, 25, TYPENUM, },
{ HEICOD, 26, 26, TYPECHAR, },
{ HETRES, 28, 30, TYPECHAR, },
{ HETCHN, 32, 32, TYPECHAR, },
{ HETSEQ, 34, 37, TYPENUM, },
{ HETCOD, 38, 38, TYPECHAR, },
{ HETYPE, 39, 40, TYPECHAR, },
{ HECOMM, 41, 70, TYPECHAR, },
{ HELENG, 72, 76, TYPENUM, },
}
--
-- indexes for SHEET record
--
local SHNUM = 1
local SHID = 2
local SHNSTR = 3
local SHIRES = 4
local SHICHN = 5
local SHISEQ = 6
local SHICOD = 7
local SHTRES = 8
local SHTCHN = 9
local SHTSEQ = 10
local SHTCOD = 11
local SHSENS = 12
local sheetPOS = {
{ SHNUM, 8, 10, TYPENUM, },
{ SHID, 12, 14, TYPECHAR, },
{ SHNSTR, 15, 16, TYPENUM, },
{ SHIRES, 18, 20, TYPECHAR, },
{ SHICHN, 22, 22, TYPECHAR, },
{ SHISEQ, 23, 26, TYPENUM, },
{ SHICOD, 27, 27, TYPECHAR, },
{ SHTRES, 29, 31, TYPECHAR, },
{ SHTCHN, 33, 33, TYPECHAR, },
{ SHTSEQ, 34, 37, TYPENUM, },
{ SHTCOD, 38, 38, TYPECHAR, },
{ SHSENS, 39, 40, TYPENUM, },
}
--
-- indexes for SEQRES record
--
local SRNUM = 1
local SRCHN = 2
local SRCNT = 3
local SRRES01 = 4
local SRRES02 = 5
local SRRES03 = 6
local SRRES04 = 7
local SRRES05 = 8
local SRRES06 = 9
local SRRES07 = 10
local SRRES08 = 11
local SRRES09 = 12
local SRRES10 = 13
local SRRES11 = 14
local SRRES12 = 15
local SRRES13 = 16
local seqPOS = {
{ SRNUM, 8, 10, TYPENUM, },
{ SRCHN, 12, 12, TYPECHAR, },
{ SRCNT, 14, 17, TYPENUM, },
{ SRRES01, 20, 22, TYPECHAR, },
{ SRRES02, 24, 26, TYPECHAR, },
{ SRRES03, 28, 30, TYPECHAR, },
{ SRRES04, 32, 34, TYPECHAR, },
{ SRRES05, 36, 38, TYPECHAR, },
{ SRRES06, 40, 42, TYPECHAR, },
{ SRRES07, 44, 46, TYPECHAR, },
{ SRRES08, 48, 50, TYPECHAR, },
{ SRRES09, 52, 54, TYPECHAR, },
{ SRRES10, 56, 58, TYPECHAR, },
{ SRRES11, 60, 62, TYPECHAR, },
{ SRRES12, 64, 66, TYPECHAR, },
{ SRRES13, 68, 70, TYPECHAR, },
}
--
-- indexes for DBREF record
--
local DBCOD = 1
local DBCHN = 2
local DBBEG = 3
local DBBIN = 4
local DBEND = 5
local DBEIN = 6
local DBNAM = 7
local DBACC = 8
local DBACD = 9
local DBDBG = 10
local DBDBI = 11
local DBDEN = 12
local DBDEI = 13
local dbPOS = {
{ DBCOD, 8, 11, TYPECHAR, },
{ DBCHN, 13, 13, TYPECHAR, },
{ DBBEG, 15, 18, TYPENUM, },
{ DBBIN, 19, 19, TYPECHAR, },
{ DBEND, 21, 24, TYPENUM, },
{ DBEIN, 25, 25, TYPECHAR, },
{ DBNAM, 27, 32, TYPECHAR, },
{ DBACC, 34, 41, TYPECHAR, },
{ DBACD, 43, 54, TYPECHAR, },
{ DBDBG, 56, 60, TYPENUM, },
{ DBDBI, 61, 61, TYPECHAR, },
{ DBDEN, 63, 67, TYPENUM, },
{ DBDEI, 68, 68, TYPECHAR, },
}
--
-- indexes for SSBOND record
--
local DINUM = 1
local DIIRES = 3
local DIICHN = 4
local DIISEQ = 5
local DIICOD = 6
local DITRES = 7
local DITCHN = 8
local DITSEQ = 9
local DITCOD = 10
local DIISYM = 11
local DITSYM = 12
local DIBNDL = 13
local disulfidePOS = {
{ DINUM, 8, 10, TYPENUM, },
{ DIIRES, 12, 14, TYPECHAR, },
{ DIICHN, 16, 16, TYPECHAR, },
{ DIISEQ, 18, 21, TYPENUM, },
{ DIICOD, 22, 22, TYPECHAR, },
{ DITRES, 26, 28, TYPECHAR, },
{ DITCHN, 30, 30, TYPECHAR, },
{ DITSEQ, 32, 35, TYPENUM, },
{ DITCOD, 36, 36, TYPECHAR, },
{ DIISYM, 60, 65, TYPENUM, },
{ DITSYM, 67, 72, TYPENUM, },
{ DIBNDL, 74, 78, TYPENUM, },
}
local function getArray ( line, strudef )
retarray = {}
for ii = 1, #strudef do
local field = line:sub ( strudef [ ii ] [ STRUBEG ], strudef [ ii ] [ STRUEND ] )
if strudef [ ii ] [ STRUTYPE ] == TYPENUM then
retarray [ strudef [ ii ] [ STRUINDX ] ] = tonumber ( field )
else
retarray [ strudef [ ii ] [ STRUINDX ] ] = field
end
end
return retarray
end
local sheetz = {}
local helixz = {}
local disulfidez = {}
local seqrez = {}
local dbrefz = {}
local linecnt = 0
local unknown = 0
for line in pdbentry:gmatch ( "(.-)[\n*\r*]" ) do
if line ~= nil then
line = line:match "^%s*(.-)%s*$" -- eliminate leading whitespace
if line:match ( "DBREF" ) then
dbrefz [ #dbrefz + 1 ] = getArray ( line, dbPOS )
elseif line:match ( "SEQRES" ) then
seqrez [ #seqrez + 1 ] = getArray ( line, seqPOS )
elseif line:match ( "HELIX" ) then
helixz [ #helixz + 1 ] = getArray ( line, helixPOS )
elseif line:match ( "SHEET" ) then
sheetz [ #sheetz + 1 ] = getArray ( line, sheetPOS )
elseif line:match ( "SSBOND" ) then
disulfidez [ #disulfidez + 1 ] = getArray ( line, disulfidePOS )
else
--print ( "unknown entry: \"" .. line .. "\"" )
unknown = unknown + 1
end
end
linecnt = linecnt + 1
end
print ( "number of lines = " .. linecnt )
print ( "number of DBREF records = " .. #dbrefz )
print ( "number of sequence records = " .. #seqrez )
print ( "number of helixes = " .. #helixz )
print ( "number of sheets = " .. #sheetz )
print ( "number of disulfides = " .. #disulfidez )
print ( "number of unknown entries = " .. unknown )
--
-- find DBREF for specified chain
-- take first match, there may be
-- multiples, reflecting different
-- sources
--
-- the "initial sequence number" in chnoff
-- is used for a preliminary adjustment of
-- the helix, sheet, and disulfide records
--
local chnoff = 0
for ii = 1, #dbrefz do
if dbrefz [ ii ] [ DBCHN ] == chain then
chnoff = dbrefz [ ii ] [ DBBEG ] - 1
print ( "matched DBREF chain " .. dbrefz [ ii ] [ DBCHN ] .. ", begins " .. dbrefz [ ii ] [ DBBEG ] )
break
end
end
--
-- SEQRES records have a repeating element, go from
-- the index of the first element
--
local aastring = ""
for ii = 1, #seqrez do
if seqrez [ ii ] [ SRCHN ] == chain then
for jj = SRRES01, SRRES13 do
if seqrez [ ii ] [ jj ]:match ( "[A-Z][A-Z][A-Z]" ) then
aastring = aastring .. AACode ( seqrez [ ii ] [ jj ] )
end
end
end
end
local sentry = {}
for ii = 1, aastring:len () do
sentry [ ii ] = "L"
end
print ( "selecting entries for chain \"" .. chain .. "\"" )
local aalen = aastring:len ()
local sstab = {}
local helixcnt = 0
local sheetcnt = 0
local disulfidecnt = 0
for ii = 1, #helixz do
if helixz [ ii ] [ HEICHN ] == chain
and helixz [ ii ] [ HETCHN ] == chain
and helixz [ ii ] [ HEISEQ ] - chnoff <= aalen -- check for invalid positions in PDB
and helixz [ ii ] [ HETSEQ ] - chnoff <= aalen -- check for invalid positions in PDB
then
sstab [ #sstab + 1 ] = { "H",
helixz [ ii ] [ HEISEQ ] - chnoff,
AACode ( helixz [ ii ] [ HEIRES ] ),
helixz [ ii ] [ HETSEQ ] - chnoff,
AACode ( helixz [ ii ] [ HETRES ] ),
}
helixcnt = helixcnt + 1
for ii = helixz [ ii ] [ HEISEQ ] - chnoff, helixz [ ii ] [ HETSEQ ] - chnoff do
sentry [ ii ] = "H"
end
end
end
for ii = 1, #sheetz do
--print ( "sheet " .. ii .. ", chains = " .. sheetz [ ii ] [ SHICHN ] .. ", " .. sheetz [ ii ] [ SHTCHN ] )
--print ( "sheet " .. ii .. ", residues = " .. sheetz [ ii ] [ SHISEQ ] .. " - " .. sheetz [ ii ] [ SHTSEQ ] )
--print ( "sheet " .. ii .. ", residues = " .. sheetz [ ii ] [ SHISEQ ] - chnoff .. " - " .. sheetz [ ii ] [ SHTSEQ ] - chnoff )
if sheetz [ ii ] [ SHICHN ] == chain
and sheetz [ ii ] [ SHTCHN ] == chain
and sheetz [ ii ] [ SHISEQ ] - chnoff <= aalen -- check for invalid positions in PDB
and sheetz [ ii ] [ SHTSEQ ] - chnoff <= aalen -- check for invalid positions in PDB
then
sstab [ #sstab + 1 ] = { "E",
sheetz [ ii ] [ SHISEQ ] - chnoff,
AACode ( sheetz [ ii ] [ SHIRES ] ),
sheetz [ ii ] [ SHTSEQ ] - chnoff,
AACode ( sheetz [ ii ] [ SHTRES ] ),
}
sheetcnt = sheetcnt + 1
for ii = sheetz [ ii ] [ SHISEQ ] - chnoff, sheetz [ ii ] [ SHTSEQ ] - chnoff do
sentry [ ii ] = "E"
end
end
end
local ditab = {}
for ii = 1, #disulfidez do
if disulfidez [ ii ] [ DIICHN ] == chain
and disulfidez [ ii ] [ DITCHN ] == chain
then
ditab [ #ditab + 1 ] = { disulfidez [ ii ] [ DIISEQ ] - chnoff, disulfidez [ ii ] [ DITSEQ ] - chnoff, }
disulfidecnt = disulfidecnt + 1
end
end
print ( "" )
print ( "selected " .. helixcnt .. " helixes for chain \"" .. chain .. "\"" )
print ( "selected " .. sheetcnt .. " sheets for chain \"" .. chain .. "\"" )
print ( "selected " .. disulfidecnt .. " disulfide bridges for chain \"" .. chain .. "\"" )
return sstab, ditab, aastring
end
--
-- end PDBReader 0.9
--
--
-- PDBReader 0.9 - helper functions
--
function makeruler ( sBeg, sEnd )
local function tenpart ( ff )
if ff >= 100 then
ff = ff % 100
end
return ( ff - ( ff % 10 ) ) / 10
end
local function hunpart ( ff )
if ff >= 1000 then
ff = ff % 1000
end
return ( ff - ( ff % 100 ) ) / 100
end
local onez = ""
local tenz = ""
local hunz = ""
local numz = 1
for ii = sBeg, sEnd do
onez = onez .. numz % 10
if ii % 10 == 0 then
tenz = tenz .. tenpart ( ii )
if ii >= 100 then
hunz = hunz .. hunpart ( ii )
else
hunz = hunz .. " "
end
else
if ii == sBeg and ii > 1 then
tenz = tenz .. tenpart ( ii )
hunz = hunz .. hunpart ( ii )
else
tenz = tenz .. " "
hunz = hunz .. " "
end
end
if ii == segCnt2 then
tenz = tenz:sub ( 1, tenz:len () - 1 ) .. tenpart ( ii )
hunz = hunz:sub ( 1, hunz:len () - 1 ) .. hunpart ( ii )
end
numz = numz + 1
if numz > 10 then
numz = 1
end
end
local ruler = ""
if sEnd >= 100 then
ruler = hunz .. "\n"
end
if sEnd >= 10 then
ruler = ruler .. tenz .. "\n"
end
ruler = ruler .. onez
return ruler
end
function pStruct ( structz, maxseg )
local ssout = "structs = {\n"
for ii = 1, #structz do
if maxseg == nil
or ( structz [ ii ] [ SSSEQ1 ] <= maxseg
and structz [ ii ] [ SSSEQ2 ] <= maxseg )
then
ssout = ssout .. " { \"" ..
structz [ ii ] [ SSTYPE ] .. "\", " ..
structz [ ii ] [ SSSEQ1 ] .. ", \"" ..
structz [ ii ] [ SSACD1 ] .. "\", " ..
structz [ ii ] [ SSSEQ2 ] .. ", \""..
structz [ ii ] [ SSACD2 ] .. "\"," ..
" },\n"
end
end
ssout = ssout .. "}"
return ssout
end
function pBridges ( disulfidez, maxseg )
local diout = ""
for ii = 1, #disulfidez do
if maxseg == nil
or ( disulfidez [ ii ] [ 1 ] <= maxseg
and disulfidez [ ii ] [ 2 ] <= maxseg )
then
if ii > 1 then
diout = diout .. " "
end
diout = diout .. disulfidez [ ii ] [ 1 ] .. "," ..disulfidez [ ii ] [ 2 ]
end
end
return diout
end
function checkStructs ( structz, aastring, errstack )
local serrs = 0
for ii = 1, #structz do
local aa1 = aastring:sub ( structz [ ii ] [ SSSEQ1 ], structz [ ii ] [ SSSEQ1 ] )
local aa2 = aastring:sub ( structz [ ii ] [ SSSEQ2 ], structz [ ii ] [ SSSEQ2 ] )
local ra1 = structz [ ii ] [ SSACD1 ]
local ra2 = structz [ ii ] [ SSACD2 ]
if aa1 ~= ra1 then
errstack [ #errstack + 1 ] = "ERROR: structure " .. ii .. ", AA " .. aa1 .. " at " .. structz [ ii ] [ SSSEQ1 ] .. " doesn't match " .. ra1
serrs = serrs + 1
end
if aa2 ~= ra2 then
errstack [ #errstack + 1 ] = "ERROR: structure " .. ii .. ", AA " .. aa2 .. " at " .. structz [ ii ] [ SSSEQ2 ] .. " doesn't match " .. ra2
serrs = serrs + 1
end
end
return serrs
end
function checkBridges ( ditab, aastring, errstack )
local serrs = 0
for ii = 1, #ditab do
local aa1 = aastring:sub ( ditab [ ii ] [ 1 ], ditab [ ii ] [ 1 ] )
local aa2 = aastring:sub ( ditab [ ii ] [ 2 ], ditab [ ii ] [ 2 ] )
if aa1 ~= "c" then
errstack [ #errstack + 1 ] = "ERROR: bridge " .. ii .. ", AA " .. aa1 .. " at " .. ditab [ ii ] [ 1 ] .. " isn't cysteine"
serrs = serrs + 1
end
if aa2 ~= "c" then
errstack [ #errstack + 1 ] = "ERROR: bridge " .. ii .. ", AA " .. aa2 .. " at " .. ditab [ ii ] [ 2 ] .. " isn't cysteine"
serrs = serrs + 1
end
end
return serrs
end
function adjustStructs ( structz, aastring, offset, errstack )
--
-- offset table
--
-- column 1 contains possible offsets
-- gathered through trial and error
-- (not super scientific)
--
-- column 2 contains the number of
-- structures that matched the offset
--
-- columns 3 and 4 are the min and
-- max residue matching the offset
--
--
local modez = {}
if offset ~= 0 then
modez [ #modez + 1 ] = { offset, 0, 9999, -1, }
modez [ #modez + 1 ] = { -offset, 0, 9999, -1, }
end
if offset ~= 1 then
modez [ #modez + 1 ] = { -1, 0, 9999, -1, }
end
if offset ~= -1 then
modez [ #modez + 1 ] = { 1, 0, 9999, -1, }
end
if math.abs ( offset ) > 1 then
modez [ #modez + 1 ] = { offset + -1, 0, 9999, -1, }
modez [ #modez + 1 ] = { -offset + 1, 0, 9999, -1, }
end
modez [ #modez + 1 ] = { 0, 0, 9999, -1, }
--[[
local modez = {
{ 0, 0, 9999, -1, },
{ offset, 0, 9999, -1, },
{ -offset, 0, 9999, -1, },
{ -1, 0, 9999, -1, },
{ 1, 0, 9999, -1, },
{ offset + -1, 0, 9999, -1, },
{ -offset + 1, 0, 9999, -1, },
}
--]]
--
-- offset table offsets
--
local MODOFF = 1
local MODCNT = 2
local MODMIN = 3
local MODMAX = 4
local function modal ( stru, modez, aastring )
--
-- raa1 and raa2 are the AAs specified in the PDB
--
local raa1 = stru [ SSACD1 ]
local raa2 = stru [ SSACD2 ]
--
-- unadjusted residues from the PDB
--
local seq1 = stru [ SSSEQ1 ]
local seq2 = stru [ SSSEQ2 ]
--
-- try each value in the modez table
-- until a match is found
--
local match = false
local modidx = 1
for ii = 1, #modez do
local tsq1 = seq1 + modez [ ii ] [ MODOFF ]
local taa1 = aastring:sub ( tsq1, tsq1 )
local tsq2 = seq2 + modez [ ii ] [ MODOFF ]
local taa2 = aastring:sub ( tsq2, tsq2 )
if taa1 == raa1
and taa2 == raa2
then
match = true
modidx = ii
modez [ ii ] [ MODMIN ] = math.min ( modez [ ii ] [ MODMIN ], math.min ( seq1, seq2 ) )
modez [ ii ] [ MODMAX ] = math.max ( modez [ ii ] [ MODMAX ], math.max ( seq1, seq2 ) )
modez [ ii ] [ MODCNT ] = modez [ ii ] [ MODCNT ] + 1
stru [ SSSEQ1 ] = tsq1
stru [ SSSEQ2 ] = tsq2
elseif taa1 == raa1 then -- first residue matches, try different fixups on second
for jj = 1, #modez do
if jj ~= ii then -- don't try the one that already flunked
local tsq2a = seq2 + modez [ jj ] [ MODOFF ]
local taa2a = aastring:sub ( tsq2a, tsq2a )
if taa2a == raa2 then
match = true
modidx = ii
modez [ ii ] [ MODMIN ] = math.min ( modez [ ii ] [ MODMIN ], seq1 )
modez [ ii ] [ MODMAX ] = math.max ( modez [ ii ] [ MODMAX ], seq1 )
modez [ jj ] [ MODMIN ] = math.min ( modez [ ii ] [ MODMIN ], seq2 )
modez [ jj ] [ MODMAX ] = math.max ( modez [ ii ] [ MODMAX ], seq2 )
modez [ ii ] [ MODCNT ] = modez [ ii ] [ MODCNT ] + 0.5
modez [ jj ] [ MODCNT ] = modez [ jj ] [ MODCNT ] + 0.5
stru [ SSSEQ1 ] = tsq1
stru [ SSSEQ2 ] = tsq2a
errstack [ #errstack + 1 ] =
"WARNING: residue "
.. seq1 ..
" adjusted by "
.. modez [ ii ] [ MODOFF ] ..
", residue "
.. seq2 ..
" adjusted by "
.. modez [ jj ] [ MODOFF ]
break
end
end
end
if match then
break
end
for jj = tsq1 + 1, seq2 - 1 do
local taa2a = aastring:sub ( jj, jj )
if taa2a == raa2 then
match = true
modidx = ii
modez [ ii ] [ MODMIN ] = math.min ( modez [ ii ] [ MODMIN ], seq1 )
modez [ ii ] [ MODMAX ] = math.max ( modez [ ii ] [ MODMAX ], seq1 )
modez [ ii ] [ MODCNT ] = modez [ ii ] [ MODCNT ] + 0.5
stru [ SSSEQ1 ] = tsq1
stru [ SSSEQ2 ] = jj
errstack [ #errstack + 1 ] =
"WARNING: residue "
.. seq1 ..
" adjusted by "
.. modez [ ii ] [ MODOFF ] ..
", residue "
.. seq2 ..
" adjusted by "
.. jj - seq2
break
end
end
end
if match then
break
end
end
return match, modidx
end
local serrs = 0
for ii = 1, #structz do
local stru = structz [ ii ]
--[[
print ( "adjusting structure "
.. ii ..
", type = \""
.. stru [ SSTYPE ] ..
"\", start = "
.. stru [ SSSEQ1 ] ..
", end = "
.. stru [ SSSEQ2 ]
)
]]--
local match, modidx = modal ( stru, modez, aastring )
if match then
print ( "structure "
.. ii ..
", type = \""
.. stru [ SSTYPE ] ..
"\", adjusted by "
.. modez [ modidx ] [ MODOFF ]
)
else
errstack [ #errstack + 1 ] = "ERROR: structure " .. ii .. ", no valid adjustment found"
serrs = serrs + 1
end
end
local bestcnt = 0
local bestmode = 0
for ii = 1, #modez do
if modez [ ii ] [ MODCNT ] > 0 then
print ( "adjusted "
.. modez [ ii ] [ MODCNT ] ..
" structures by "
.. modez [ ii ] [ MODOFF ] ..
", min residue = "
.. modez [ ii ] [ MODMIN ] ..
", max residue = "
.. modez [ ii ] [ MODMAX ]
)
end
end
return serrs
end
function adjustBridges ( ditab, aastring, offset, errstack )
--
-- offset table
--
-- column 1 contains possible offsets
-- gathered through trial and error
-- (not super scientific)
--
-- column 2 contains the number of
-- structures that matched the offset
--
-- columns 3 and 4 are the min and
-- max residue matching the offset
--
--
local modez = {}
if offset ~= 0 then
modez [ #modez + 1 ] = { offset, 0, 9999, -1, }
modez [ #modez + 1 ] = { -offset, 0, 9999, -1, }
end
if offset ~= 1 then
modez [ #modez + 1 ] = { -1, 0, 9999, -1, }
end
if offset ~= -1 then
modez [ #modez + 1 ] = { 1, 0, 9999, -1, }
end
if math.abs ( offset ) > 1 then
modez [ #modez + 1 ] = { offset + -1, 0, 9999, -1, }
modez [ #modez + 1 ] = { -offset + 1, 0, 9999, -1, }
end
modez [ #modez + 1 ] = { 0, 0, 9999, -1, }
--[[
local modez = {
{ 0, 0, 9999, -1, },
{ offset, 0, 9999, -1, },
{ -offset, 0, 9999, -1, },
{ -1, 0, 9999, -1, },
{ 1, 0, 9999, -1, },
{ offset + -1, 0, 9999, -1, },
{ -offset + 1, 0, 9999, -1, },
}
--]]
--
-- offset table offsets
--
local MODOFF = 1
local MODCNT = 2
local MODMIN = 3
local MODMAX = 4
local function modal ( bridge, modez, aastring )
--
-- unadjusted residues from the PDB
--
local seq1 = bridge [ 1 ]
local seq2 = bridge [ 2 ]
--
-- try each value in the modez table
-- until a match is found
--
local match = false
local modidx = 1
for ii = 1, #modez do
local tsq1 = seq1 + modez [ ii ] [ MODOFF ]
local taa1 = aastring:sub ( tsq1, tsq1 )
local tsq2 = seq2 + modez [ ii ] [ MODOFF ]
local taa2 = aastring:sub ( tsq2, tsq2 )
if taa1 == "c"
and taa2 == "c"
then
match = true
modidx = ii
modez [ ii ] [ MODMIN ] = math.min ( modez [ ii ] [ MODMIN ], math.min ( seq1, seq2 ) )
modez [ ii ] [ MODMAX ] = math.max ( modez [ ii ] [ MODMAX ], math.max ( seq1, seq2 ) )
modez [ ii ] [ MODCNT ] = modez [ ii ] [ MODCNT ] + 1
bridge [ 1 ] = tsq1
bridge [ 2 ] = tsq2
break
elseif taa1 == "c" then -- first residue matches, try different fixups on second
for jj = 1, #modez do
if jj ~= ii then -- don't try the one that already flunked
local tsq2a = seq2 + modez [ jj ] [ MODOFF ]
local taa2a = aastring:sub ( tsq2a, tsq2a )
if taa2a == "c" then
match = true
modidx = ii
modez [ ii ] [ MODMIN ] = math.min ( modez [ ii ] [ MODMIN ], seq1 )
modez [ ii ] [ MODMAX ] = math.max ( modez [ ii ] [ MODMAX ], seq1 )
modez [ jj ] [ MODMIN ] = math.min ( modez [ jj ] [ MODMIN ], seq2 )
modez [ jj ] [ MODMAX ] = math.max ( modez [ jj ] [ MODMAX ], seq2 )
modez [ ii ] [ MODCNT ] = modez [ ii ] [ MODCNT ] + 0.5
modez [ jj ] [ MODCNT ] = modez [ jj ] [ MODCNT ] + 0.5
bridge [ 1 ] = tsq1
bridge [ 2 ] = tsq2a
errstack [ #errstack + 1 ] =
"WARNING: residue "
.. seq1 ..
" adjusted by "
.. modez [ ii ] [ MODOFF ] ..
", residue "
.. seq2 ..
" adjusted by "
.. modez [ jj ] [ MODOFF ]
--serrs = serr + 1
break
end
end
end
end
if match then
break
end
end
return match, modidx
end
local serrs = 0
for ii = 1, #ditab do
local bridge = ditab [ ii ]
local match, modidx = modal ( bridge, modez, aastring )
if match then
print ( "bridge "
.. ii ..
" adjusted by "
.. modez [ modidx ] [ MODOFF ]
)
else
errstack [ #errstack + 1 ] = "ERROR: bridge " .. ii .. ", no valid adjustment found"
serrs = serrs + 1
end
end
local bestcnt = 0
local bestmode = 0
for ii = 1, #modez do
if modez [ ii ] [ MODCNT ] > 0 then
print ( "adjusted "
.. modez [ ii ] [ MODCNT ] ..
" bridges by "
.. modez [ ii ] [ MODOFF ] ..
", residue 1 = "
.. modez [ ii ] [ MODMIN ] ..
", residue 2 = "
.. modez [ ii ] [ MODMAX ]
)
end
end
return serrs
end
--
-- generate string from stucture array
--
-- do not assume numbering starts at 1,
-- PDBs may start numbering at 0, and
-- even negative numbers are allowed
--
function struString ( structz, plen )
local ss = {}
local minseq = 999999
local maxseq = -999999
for ii = 1, #structz do
for jj = structz [ ii ] [ SSSEQ1 ], structz [ ii ] [ SSSEQ2 ] do
ss [ jj ] = structz [ ii ] [ SSTYPE ]
if jj < minseq then
minseq = jj
end
if jj > maxseq then
maxseq = jj
end
end
end
local tlen = maxseq - minseq + 1
for ii = minseq, maxseq do
if ss [ ii ] == nil then
ss [ ii ] = "L"
end
end
if minseq > 1 then
for ii = 1, minseq - 1 do
ss [ ii ] = "L"
if ii < minseq then
minseq = ii
end
tlen = tlen + 1
end
end
if tlen < plen then
local dlen = plen - tlen
local mxsq2 = maxseq
for ii = maxseq + 1, maxseq + dlen do
ss [ ii ] = "L"
mxsq2 = mxsq2 + 1
end
maxseq = mxsq2
end
local ssstr = ""
for ii = minseq, maxseq do
ssstr = ssstr .. ss [ ii ]
end
return ssstr
end
--
-- end PDBReader 0.9 - helper functions
--
function GetPDB ( )
local dlog = dialog.CreateDialog ( ReVersion )
local pdb = ""
local chain = "A"
local offset = "0"
dlog.pdb = dialog.AddTextbox ( "PDB info", pdb )
dlog.u0 = dialog.AddLabel ( "" )
dlog.u1 = dialog.AddLabel ( "Usage: copy the header of the PDB file" )
dlog.u2 = dialog.AddLabel ( "and paste into the PDB info box." )
dlog.u3 = dialog.AddLabel ( "The DBREF, SEQRES, HELIX, SHEET, and SSBOND" )
dlog.u4 = dialog.AddLabel ( "records are used, others are ignored." )
dlog.w0 = dialog.AddLabel ( "" )
dlog.chain = dialog.AddTextbox ( "chain", chain )
dlog.w5 = dialog.AddLabel ( "" )
dlog.ok = dialog.AddButton ( "OK" , 1 )
dlog.exit = dialog.AddButton ( "Exit" , 0 )
if ( dialog.Show ( dlog ) > 0 ) then
pdb = dlog.pdb.value
pdb = pdb .. "\n"
chain = dlog.chain.value
else
pdb = ""
chain = ""
end
return pdb, chain
end
function ApplySS ( sstring )
print ( "" )
print ( "---updating secondary structure---" )
local changes = 0
local oldss = ""
for ii = 1, sstring:len() do
local ss0 = structure.GetSecondaryStructure ( ii )
oldss = oldss .. ss0
local ss1 = sstring:sub ( ii, ii )
if ss0 ~= ss1 then
structure.SetSecondaryStructure ( ii, ss1 )
changes = changes + 1
end
end
print ( changes .. " segments changed" )
if changes > 0 then
print ( "" )
print ( "---original secondary structure---" )
print ( oldss )
print ( "---updated secondary structure---" )
print ( sstring )
end
end
function Matched ( curseq, matidx, structz, sstring, disulfidez, aastring )
--
-- curseq - primary structure string for current Foldit protein
-- matidx - index of match between curseq and aastring
-- structz - structure table returned by PDBReader
-- sstring - secondary structure string generated from structz
-- disulfidez - disulfide bridge table returned by PDBReader
-- aastring - primary structure string returned by PDBReader
--
local ssout = ""
local diout = ""
local curlen = curseq:len()
local offset = 1 - matidx
print ( "---adjusted results---" )
print ( "" )
if aastring:len () > 0 then
print ( "" )
print ( "---primary sequence (FASTA format)---" )
local matend = matidx + curlen - 1
aastring = aastring:sub ( matidx, matend )
print ( makeruler ( 1, curlen ) )
print ( aastring )
print ( "---Foldit primary sequence (for comparison)---" )
print ( curseq )
end
print ( "" )
--
-- check structure array, see if it matches adjusted string
--
local errstack = {}
local serrs = checkStructs ( structz, aastring, errstack )
if serrs > 0 then
print ( "structure array does not match, adjusting" )
errstack = {}
serrs = adjustStructs ( structz, aastring, offset, errstack )
for ii = 1, #errstack do
print ( errstack [ ii ] )
end
errstack = {}
else
print ( "structure array matches AA sequence, not adjusting" )
end
print ( "" )
if #structz > 0 then
print ( "---secondary stucture table---" )
ssout = pStruct ( structz, curlen )
print ( ssout )
end
sstring = struString ( structz, curlen )
if sstring:len () > 0 then
print ( "" )
print ( "---secondary structure string---" )
print ( makeruler ( 1, curlen ) )
print ( sstring )
end
if #disulfidez > 0 then
local errstack = {}
local serrs = checkBridges ( disulfidez, aastring, errstack )
if serrs > 0 then
print ( "disulfide bridges do not match, adjusting" )
errstack = {}
serrs = adjustBridges ( disulfidez, aastring, offset, errstack )
for ii = 1, #errstack do
print ( errstack [ ii ] )
end
errstack = {}
else
print ( "disulfide bridges match AA sequence, not adjusting" )
end
print ( "" )
print ( "---disulfide bridges---" )
diout = pBridges ( disulfidez )
print ( diout )
print ( "" )
end
local dlog = dialog.CreateDialog ( ReVersion )
dlog.tab0 = dialog.AddLabel ( "PDB exactly matches current protein" )
dlog.offs1 = dialog.AddLabel ( "PDB residue - Foldit segment = " .. offset )
dlog.sp1 = dialog.AddLabel ( "" )
dlog.csv = dialog.AddTextbox ( "structs", ssout )
dlog.ssp = dialog.AddTextbox ( "SS string", sstring )
if #disulfidez > 0 then
dlog.ssc = dialog.AddTextbox ( "disulfides", diout )
end
dlog.u0 = dialog.AddLabel ( "" )
dlog.u1 = dialog.AddLabel ( "structs respresents the structures as a Lua table" )
dlog.u2 = dialog.AddLabel ( "SS string is the format used by SS Edit and print protein" )
dlog.u4 = dialog.AddLabel ( "" )
if #disulfidez > 0 then
dlog.u5 = dialog.AddLabel ( "disulfides shows bridges in Bridge Wiggle format")
end
dlog.v1 = dialog.AddLabel ( "Usage: use select all and copy" )
dlog.w0 = dialog.AddLabel ( "" )
dlog.w1 = dialog.AddLabel ( "Windows: ctrl + a = select all" )
dlog.w3 = dialog.AddLabel ( "Windows: ctrl + c = copy" )
dlog.z0 = dialog.AddLabel ( "" )
dlog.z2 = dialog.AddLabel ( "Apply copies the SS string to the protein" )
dlog.ok = dialog.AddButton ( "Apply" , 1 )
dlog.apply = dialog.AddButton ( "Cancel", 0 )
local ret = 0
ret = dialog.Show ( dlog )
if ret >0 then
ApplySS ( sstring )
else
print ( "No changes saved" )
end
return structz, sstring, aastring
end
function DoConvert ( offset, len, structz, sstring, disulfidez, aastring )
local ssout = ""
local diout = ""
local matidx = -offset + 1
local matend = matidx + len - 1
print ( "" )
print ( "---adjusted results for non-matching PDB entry---" )
local aalen = aastring:len ()
if aalen > 0 then
print ( "" )
print ( "---primary sequence (FASTA format)---" )
print ( makeruler ( 1, len ) )
aastring = aastring:sub ( matidx, matend )
aalen = aastring:len ()
print ( aastring )
end
print ( "" )
if #structz > 0 then
print ( "---secondary stucture table---" )
for ii = 1, #structz do
structz [ ii ] [ SSSEQ1 ] = structz [ ii ] [ SSSEQ1 ] + offset
structz [ ii ] [ SSSEQ2 ] = structz [ ii ] [ SSSEQ2 ] + offset
end
ssout = pStruct ( structz )
print ( ssout )
end
if sstring:len () > 0 then
print ( "" )
print ( "---secondary structure string---" )
print ( makeruler ( 1, len ) )
sstring = struString ( structz, len )
sstring = sstring:sub ( matidx, matend )
print ( sstring )
end
if #disulfidez > 0 then
print ( "" )
print ( "---disulfide bridges---" )
for ii = 1, #disulfidez do
disulfidez [ ii ] [ 1 ] = disulfidez [ ii ] [ 1 ] + offset
disulfidez [ ii ] [ 2 ] = disulfidez [ ii ] [ 2 ] + offset
end
diout = pBridges ( disulfidez )
print ( diout )
end
end
function Nomatch ( curseq, structz, sstring, disulfidez, aastring )
local offset = 0
local len = 0
local dlog = dialog.CreateDialog ( ReVersion )
dlog.tab0 = dialog.AddLabel ( "PDB doesn't match current protein" )
dlog.tab1 = dialog.AddLabel ( "(" .. ReVersion .. " only detects exact matches)" )
dlog.fit = dialog.AddTextbox ( "Foldit", curseq )
dlog.pdb = dialog.AddTextbox ( "PDB ", aastring )
dlog.tab2 = dialog.AddLabel ( "If desired, enter custom offset and length" )
dlog.tab3 = dialog.AddLabel ( "to use with the PDB protein. The results" )
dlog.tab4 = dialog.AddLabel ( "can be used with another recipe." )
dlog.tab5 = dialog.AddLabel ( "Offset is the PDB residue number minus" )
dlog.tab6 = dialog.AddLabel ( "the Foldit segment number. Length is the" )
dlog.tab7 = dialog.AddLabel ( "length of the target protein in Foldit." )
dlog.tab8 = dialog.AddLabel ( "The output appears in the scriptlog file." )
dlog.offset = dialog.AddTextbox ( "offset", offset )
dlog.len = dialog.AddTextbox ( "length", len )
dlog.ok = dialog.AddButton ( "Convert" , 1 )
dlog.cancel = dialog.AddButton ( "Cancel", 0 )
local res = 0
res = dialog.Show ( dlog )
if res > 0 then
offset = dlog.offset.value
len = tonumber ( dlog.len.value )
DoConvert ( offset, len, structz, sstring, disulfidez, aastring )
end
end
--
-- generate string from stucture array
--
-- do not assume numbering starts at 1,
-- some PDBs start numbering at 0
--
-- this method allows for negative numbering,
-- not yet observed by the author
--
function struString ( structz, plen )
local ss = {}
local minseq = 999999
local maxseq = -999999
for ii = 1, #structz do
for jj = structz [ ii ] [ SSSEQ1 ], structz [ ii ] [ SSSEQ2 ] do
ss [ jj ] = structz [ ii ] [ SSTYPE ]
if jj < minseq then
minseq = jj
end
if jj > maxseq then
maxseq = jj
end
end
end
local tlen = maxseq - minseq + 1
for ii = minseq, maxseq do
if ss [ ii ] == nil then
ss [ ii ] = "L"
end
end
if minseq > 1 then
for ii = 1, minseq - 1 do
ss [ ii ] = "L"
if ii < minseq then
minseq = ii
end
tlen = tlen + 1
end
end
if tlen < plen then
local dlen = plen - tlen
local mxsq2 = maxseq
for ii = maxseq + 1, maxseq + dlen do
ss [ ii ] = "L"
mxsq2 = mxsq2 + 1
end
maxseq = mxsq2
end
local ssstr = ""
for ii = minseq, maxseq do
ssstr = ssstr .. ss [ ii ]
end
return ssstr
end
function main ()
print ( ReVersion )
print ( "Puzzle: " .. puzzle.GetName () )
print ( "Track: " .. ui.GetTrackName () )
local pdb = ""
local offset = 0
local ires = 0
local tres = 0
local rmax = structure.GetCount ()
local curseq = ""
for ii = 1, rmax do
if structure.GetSecondaryStructure ( ii ) ~= "M" then
curseq = curseq .. structure.GetAminoAcid ( ii )
end
end
print ( "---Foldit sequence---" )
print ( makeruler ( 1, curseq:len () ) )
print ( curseq )
print ( "" )
pdb, chain = GetPDB ( rmax )
print ( "length of PDB input = " .. pdb:len () )
if pdb:len () > 0 then
--
-- calculate the starting and ending PDB residues
-- based on the length of the Foldit protein
--
ires = 1 + offset
tres = rmax + offset
local structz = ""
local sstring = ""
local disufidez = ""
local aastring = ""
structz, disulfidez, aastring = PDBReader ( pdb, chain )
sstring = struString ( structz, aastring:len () )
print ( "" )
print ( "---unadjusted results---" )
if #structz > 0
and sstring ~= nil and sstring:len () > 0 then
print ( "" )
print ( "---secondary structure table---" )
print ( pStruct ( structz ) )
print ( "" )
print ( "---secondary structure string---" )
print ( sstring )
if #disulfidez > 0 then
print ( "" )
print ( "---disulfide bridges---" )
print ( pBridges ( disulfidez ) )
end
if aastring ~= nil and aastring:len () > 0 then
print ( "" )
print ( "---amino acid string (FASTA format)---" )
print ( aastring )
end
else
print ( "no results, input format may be wrong" )
end
local matched = false
if aastring ~= nil and aastring:len () > 0 then
local idx = aastring:find ( curseq, 1, true )
if idx ~= nil then
print ( "" )
print ( "current Foldit protein matches PDB" )
print ( "PDB offset = " .. 1 - idx )
print ( "(offset is PDB residue - Foldit segment)" )
print ( "" )
Matched ( curseq, idx, structz, sstring, disulfidez, aastring )
matched = true
else
print ( "PDB does not match current protein" )
end
else
print ( "no sequence information found in PDB input" )
end
if not matched then
Nomatch ( curseq, structz, sstring, disulfidez, aastring )
end
end
cleanup ()
end
function cleanup ( errmsg )
--
-- do not loop if cleanup causes an error
--
if CLEANUPENTRY ~= nil then
return
end
CLEANUPENTRY = true
print ( "---" )
--
-- model 100 - print recipe name, puzzle, track, time, score, and gain
--
local reason
local start, stop, line, msg
if errmsg == nil then
reason = "complete"
else
--
-- model 120 - civilized errmsg reporting,
-- thanks to Bruno K. and Jean-Bob
--
start, stop, line, msg = errmsg:find ( ":(%d+):%s()" )
if msg ~= nil then
errmsg = errmsg:sub ( msg, #errmsg )
end
if errmsg:find ( "Cancelled" ) ~= nil then
reason = "cancelled"
else
reason = "error"
end
end
print ( ReVersion .. " " .. reason )
print ( "Puzzle: " .. puzzle.GetName () )
print ( "Track: " .. ui.GetTrackName () )
if reason == "error" then
print ( "Unexpected error detected" )
print ( "Error line: " .. line )
print ( "Error: \"" .. errmsg .. "\"" )
end
end
xpcall ( main, cleanup )