Skip to content

c05 apoio, outubro

Peter edited this page Oct 5, 2017 · 2 revisions

Lembretes

Ao usar API do PMC retornou os seguintes casos classificados como "Comment" que eram artigos de fato,

pmcid ISSN doctype date2
27733281 0140-6736 Comment 2016-10-08
27733282 0140-6736 Comment 2016-10-08
27733283 0140-6736 Comment 2016-10-08

Lixo

CREATE TABLE amostra1 (
  pmcid int,
  issn text,
  doctype text,
  date2 text
);
COPY amostra1 FROM '/tmp/c05_articles_pmid-EXPANDED.csv' WITH (FORMAT csv) HEADER;

DELETE FROM amostra1 WHERE pmcid is null OR pmcid=0; --2

SELECT count(issn) as n_issn, count(*) as n, count(distinct pmcid) from amostra1; --    531 | 1458 |  1442

CREATE TABLE amostra1_limpo AS 
 SELECT DISTINCT pmcid, issn, doctype, date2::date as date2 
 FROM amostra1
 WHERE doctype NOT IN ('Editorial'); 

select * from amostra1_limpo where doctype!='Journal Article' order by doctype, pmcid;

Casos que não eram Journal Article:

pmcid doctype date2
27028323 Clinical Trial 2016-03-30
27242166 Clinical Trial 2016-06-27
27509020 Clinical Trial 2016-08-10
27820821 Clinical Trial 2016-11-07
27887591 Clinical Trial 2016-11-25
27733281 Comment 2016-10-08
27733282 Comment 2016-10-08
27733283 Comment 2016-10-08
28090535 Editorial 2016-01-01
23410000 Evaluation Studies 2013-02-14
23809208 Evaluation Studies 2013-06-28
23967358 Evaluation Studies 2013-08-15
24148652 Evaluation Studies 2013-10-22
24603874 Evaluation Studies 2014-03-06
26641858 Evaluation Studies 2015-12-07
26910315 Evaluation Studies 2016-02-24
26962084 Evaluation Studies 2016-05-01
26986061 Evaluation Studies 2016-03-17
27102015 Evaluation Studies 2016-04-21
27314656 Evaluation Studies 2016-05-01
27532497 Evaluation Studies 2016-08-17
27549586 Evaluation Studies 2016-10-15
27618941 Evaluation Studies 2016-09-13
27768689 Evaluation Studies 2016-10-21
27768698 Evaluation Studies 2016-10-21
28116530 Evaluation Studies 2017-03-01
28153043 Evaluation Studies 2017-02-02
28183291 Evaluation Studies 2017-02-10
28183328 Evaluation Studies 2017-02-10
28339458 Evaluation Studies 2017-03-24
28542603 Evaluation Studies 2017-05-25
28651016 Evaluation Studies 2017-06-26
28719659 Evaluation Studies 2017-07-18
27564984 Historical Article 2016-09-01
27708473 Historical Article 2016-09-01
28285851 Historical Article 2017-06-01
28727821 Historical Article 2017-07-20
..37 rows ... ...

testes sql

CREATE TABLE doaj_info AS
  -- conferir porque não está completo, ver http://miar.ub.edu/issn/0021-9584 ou http://www.issn.cc/1932-6203
  --  ambos tem URLs da revista 
  SELECT CASE when is1 is not null THEN issn.n2c(is1) ELSE issn.n2c(is2) END as issnl,  
         pup_country, license, lic_attribs, license_ismachine
  FROM (
    SELECT "Journal ISSN (print version)" as is1, "Journal EISSN (online version)" as is2, 
     "Country of publisher" as pup_country, "Machine-readable CC licensing information embedded or displayed" as license_ismachine,
     "Journal license" as license, "License attributes" as lic_attribs
    FROM  tmpcsv_doaj 
  ) t
;

Ver https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=28727821


Riqueza e diversidade

  • Richness R = 310 ISSNs
  • Richness R = 123 ISSNs filtering with select issn, count(*) from amostra1_limpo group by 1 having count(*)>1 order by 2 desc

Simpson Diversity-dominance Index

Simpson's Index (D) measures the probability that two individuals randomly selected from a sample will belong to the same species

select sum(  (count(issn)::float/(select count(*) from amostra1_limpo)::float)^2  ) as "D"
from amostra1_limpo group by issn 
;

select count(*) from amostra1_limpo group by 1 ;

-- P=N::float/count(*)::float

VACUUM FULL;

-- 1.3*N/TOT ...  limit N;
-- precisa retornar mesmo tipo
CREATE or replace FUNCTION amostra1_limpo_get(bigint) RETURNS SETOF amostra1_limpo AS $$ 
   SELECT * 
   FROM amostra1_limpo 
        TABLESAMPLE BERNOULLI( ROUND(120.0*$1::float/(select count(*) from amostra1_limpo)::float) ) LIMIT $1;
$$ LANGUAGE SQL;
-- opcao dinamica. RETURN QUERY EXECUTE format(
--    'SELECT * FROM %I  ... TABLESAMPLE BERNOULLI((select count(*) from %I)) ', 
--   tname,   120, tname,    
-- );

-- simpson2

WITH t AS (
 SELECT count(issn) as n, count(issn)-1 as n1 
 FROM amostra1_limpo 
 GROUP BY issn --HAVING count(*)>1
) SELECT (SELECT sum(n*n1) from t)::float/(SELECT sum(n)*sum(n1) from t)::float as "D";

D = 0.053, ou seja 5,3% de chance de dois artigos aleatórios terem mesmo ISSN. (com count>10 sobre para 9%) considerando tudo temos 4,6% de chance.

Equitatividade de Pielou

Conceito em https://en.wikipedia.org/wiki/Species_evenness

WITH qt AS (
 SELECT count(issn) as n 
 FROM amostra1_limpo 
 GROUP BY issn -- HAVING count(*)>1
) SELECT -sum( p*ln(p) ) / ln((SELECT sum(n) FROM qt)::float) as "E"
  FROM (
   SELECT n::float/(SELECT sum(n) FROM qt)::float as p 
   FROM qt
  ) t
;  -- 0.54 com count, 0.6 = 60%

-- distribuição por ano:
select count(*) as n, count(distinct issn) as is, substr(date2::text,0,5)::int as ano from amostra1_limpo group by 3 order by 3;